ENH: DataFrame.to_parquet() returns bytes if path_or_buf not provided #37129

Merged · 17 commits · Oct 21, 2020
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.2.0.rst
@@ -221,6 +221,7 @@ Other enhancements
 - :meth:`Rolling.var()` and :meth:`Rolling.std()` use Kahan summation and Welford's method to avoid numerical issues (:issue:`37051`)
 - :meth:`DataFrame.plot` now recognizes ``xlabel`` and ``ylabel`` arguments for plots of type ``scatter`` and ``hexbin`` (:issue:`37001`)
 - :class:`DataFrame` now supports ``divmod`` operation (:issue:`37165`)
+- :meth:`DataFrame.to_parquet` now returns a ``bytes`` object when no ``path`` argument is passed (:issue:`37105`)

 .. _whatsnew_120.api_breaking.python:

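For reference, a minimal usage sketch of the behavior this whatsnew entry describes (not part of the diff; assumes pyarrow is installed so the default engine can write to an in-memory buffer):

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0]})

# As of 1.2.0, calling to_parquet() without a path serializes the frame
# to an in-memory buffer and returns the raw parquet payload as bytes.
payload = df.to_parquet()
assert isinstance(payload, bytes)
```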
17 changes: 11 additions & 6 deletions pandas/core/frame.py
@@ -2289,14 +2289,14 @@ def to_markdown(
     @deprecate_kwarg(old_arg_name="fname", new_arg_name="path")
     def to_parquet(
         self,
-        path: FilePathOrBuffer[AnyStr],
+        path: Optional[FilePathOrBuffer] = None,
         engine: str = "auto",
         compression: Optional[str] = "snappy",
         index: Optional[bool] = None,
         partition_cols: Optional[List[str]] = None,
         storage_options: StorageOptions = None,
         **kwargs,
-    ) -> None:
+    ) -> Optional[bytes]:
         """
         Write a DataFrame to the binary parquet format.

@@ -2307,14 +2307,15 @@ def to_parquet(

         Parameters
         ----------
-        path : str or file-like object
+        path : str or file-like object, default None
             If a string, it will be used as Root Directory path
             when writing a partitioned dataset. By file-like object,
             we refer to objects with a write() method, such as a file handle
             (e.g. via builtin open function) or io.BytesIO. The engine
-            fastparquet does not accept file-like objects.
+            fastparquet does not accept file-like objects. If path is None,
+            a bytes object is returned.

-            .. versionchanged:: 1.0.0
+            .. versionchanged:: 1.2.0

             Previously this was "fname"
@@ -2357,6 +2358,10 @@ def to_parquet(
             Additional arguments passed to the parquet library. See
             :ref:`pandas io <io.parquet>` for more details.

+        Returns
+        -------
+        bytes if no path argument is provided else None
+
         See Also
         --------
         read_parquet : Read a parquet file.
@@ -2392,7 +2397,7 @@
         """
         from pandas.io.parquet import to_parquet

-        to_parquet(
+        return to_parquet(
             self,
             path,
             engine,
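Taken together, the signature and docstring changes above describe two call modes; a short sketch contrasting them (the file name is illustrative):

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2]})

# With a path, behavior is unchanged: the file is written and None is returned.
assert df.to_parquet("example.parquet") is None  # "example.parquet" is illustrative

# Without a path, the new Optional[bytes] return type comes into play.
raw = df.to_parquet()
assert isinstance(raw, bytes)
```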
29 changes: 22 additions & 7 deletions pandas/io/parquet.py
@@ -1,5 +1,6 @@
""" parquet compat """

+import io
from typing import Any, AnyStr, Dict, List, Optional
from warnings import catch_warnings

@@ -238,28 +239,29 @@ def read(

 def to_parquet(
     df: DataFrame,
-    path: FilePathOrBuffer[AnyStr],
+    path: Optional[FilePathOrBuffer] = None,
     engine: str = "auto",
     compression: Optional[str] = "snappy",
     index: Optional[bool] = None,
     storage_options: StorageOptions = None,
     partition_cols: Optional[List[str]] = None,
     **kwargs,
-):
+) -> Optional[bytes]:
     """
     Write a DataFrame to the parquet format.

     Parameters
     ----------
     df : DataFrame
-    path : str or file-like object
+    path : str or file-like object, default None
         If a string, it will be used as Root Directory path
         when writing a partitioned dataset. By file-like object,
         we refer to objects with a write() method, such as a file handle
         (e.g. via builtin open function) or io.BytesIO. The engine
-        fastparquet does not accept file-like objects.
+        fastparquet does not accept file-like objects. If path is None,
+        a bytes object is returned.

-        .. versionchanged:: 0.24.0
+        .. versionchanged:: 1.2.0

     engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
         Parquet library to use. If 'auto', then the option
@@ -298,20 +300,33 @@ def to_parquet(

     kwargs
         Additional keyword arguments passed to the engine

+    Returns
+    -------
+    bytes if no path argument is provided else None
     """
     if isinstance(partition_cols, str):
         partition_cols = [partition_cols]
     impl = get_engine(engine)
-    return impl.write(
+
+    path_or_buf: FilePathOrBuffer = io.BytesIO() if path is None else path
+
+    impl.write(
         df,
-        path,
+        path_or_buf,
         compression=compression,
         index=index,
         partition_cols=partition_cols,
         storage_options=storage_options,
         **kwargs,
     )
+
+    if path is None:
+        assert isinstance(path_or_buf, io.BytesIO)
+        return path_or_buf.getvalue()
+    else:
+        return None


 def read_parquet(path, engine: str = "auto", columns=None, **kwargs):
     """
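The branch added above is an instance of a generic buffer-or-path pattern: write into an io.BytesIO when no destination is given, then hand back its contents. A self-contained sketch of that pattern (write_or_return is a hypothetical helper, not pandas API):

```python
import io
from typing import Optional

def write_or_return(payload: bytes, path: Optional[str] = None) -> Optional[bytes]:
    """Write payload to path, or return it as bytes when no path is given."""
    if path is None:
        buf = io.BytesIO()           # in-memory stand-in for a file
        buf.write(payload)
        return buf.getvalue()        # hand the accumulated bytes to the caller
    with open(path, "wb") as f:      # otherwise write to the named file
        f.write(payload)
    return None                      # mirrors the Optional[bytes] contract
```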
11 changes: 11 additions & 0 deletions pandas/tests/io/test_parquet.py
@@ -512,6 +512,17 @@ def test_basic_subset_columns(self, pa, df_full):
             read_kwargs={"columns": ["string", "int"]},
         )

+    def test_to_bytes_without_path_or_buf_provided(self, pa, df_full):
+        # GH 37105
+
+        buf_bytes = df_full.to_parquet(engine=pa)
+        assert isinstance(buf_bytes, bytes)
+
+        buf_stream = BytesIO(buf_bytes)
+        res = pd.read_parquet(buf_stream)
+
+        tm.assert_frame_equal(df_full, res)
+
     def test_duplicate_columns(self, pa):
         # not currently able to handle duplicate columns
         df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=list("aaa")).copy()
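A standalone round trip in the spirit of the new test, runnable outside the pandas suite (assumes pyarrow is installed; pandas._testing is the helper module the suite imports as tm):

```python
import io

import pandas as pd
import pandas._testing as tm

df = pd.DataFrame({"string": ["a", "b"], "int": [1, 2]})

raw = df.to_parquet(engine="pyarrow")  # bytes, since no path was given
assert isinstance(raw, bytes)

# Feed the bytes back through read_parquet to confirm a lossless round trip.
result = pd.read_parquet(io.BytesIO(raw))
tm.assert_frame_equal(df, result)
```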