diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index e4d97168692b3..28c86015fb7b6 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -221,6 +221,7 @@ Other enhancements
 - :meth:`Rolling.var()` and :meth:`Rolling.std()` use Kahan summation and Welfords Method to avoid numerical issues (:issue:`37051`)
 - :meth:`DataFrame.plot` now recognizes ``xlabel`` and ``ylabel`` arguments for plots of type ``scatter`` and ``hexbin`` (:issue:`37001`)
 - :class:`DataFrame` now supports ``divmod`` operation (:issue:`37165`)
+- :meth:`DataFrame.to_parquet` now returns a ``bytes`` object when no ``path`` argument is passed (:issue:`37105`)
 
 .. _whatsnew_120.api_breaking.python:
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 285d4fc34cf98..5729d632a64ec 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -2289,14 +2289,14 @@ def to_markdown(
 
     @deprecate_kwarg(old_arg_name="fname", new_arg_name="path")
     def to_parquet(
         self,
-        path: FilePathOrBuffer[AnyStr],
+        path: Optional[FilePathOrBuffer] = None,
         engine: str = "auto",
         compression: Optional[str] = "snappy",
         index: Optional[bool] = None,
         partition_cols: Optional[List[str]] = None,
         storage_options: StorageOptions = None,
         **kwargs,
-    ) -> None:
+    ) -> Optional[bytes]:
         """
         Write a DataFrame to the binary parquet format.
@@ -2307,14 +2307,15 @@ def to_parquet(
 
         Parameters
         ----------
-        path : str or file-like object
+        path : str or file-like object, default None
             If a string, it will be used as Root Directory path
             when writing a partitioned dataset. By file-like object,
             we refer to objects with a write() method, such as a file handle
             (e.g. via builtin open function) or io.BytesIO. The engine
-            fastparquet does not accept file-like objects.
+            fastparquet does not accept file-like objects. If path is None,
+            a bytes object is returned.
 
-            .. versionchanged:: 1.0.0
+            .. versionchanged:: 1.2.0
 
             Previously this was "fname"
 
@@ -2357,6 +2358,10 @@ def to_parquet(
             Additional arguments passed to the parquet library. See
             :ref:`pandas io <io.parquet>` for more details.
 
+        Returns
+        -------
+        bytes if no path argument is provided else None
+
         See Also
         --------
         read_parquet : Read a parquet file.
@@ -2392,7 +2397,7 @@
         """
         from pandas.io.parquet import to_parquet
 
-        to_parquet(
+        return to_parquet(
             self,
             path,
             engine,
diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index 97ec0ed1f7fdc..88f57e18593f2 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -1,5 +1,6 @@
 """ parquet compat """
 
+import io
 from typing import Any, AnyStr, Dict, List, Optional
 from warnings import catch_warnings
 
@@ -238,28 +239,29 @@ def read(
 
 def to_parquet(
     df: DataFrame,
-    path: FilePathOrBuffer[AnyStr],
+    path: Optional[FilePathOrBuffer] = None,
     engine: str = "auto",
     compression: Optional[str] = "snappy",
     index: Optional[bool] = None,
     storage_options: StorageOptions = None,
     partition_cols: Optional[List[str]] = None,
     **kwargs,
-):
+) -> Optional[bytes]:
     """
     Write a DataFrame to the parquet format.
 
     Parameters
     ----------
     df : DataFrame
-    path : str or file-like object
+    path : str or file-like object, default None
         If a string, it will be used as Root Directory path
         when writing a partitioned dataset. By file-like object,
         we refer to objects with a write() method, such as a file handle
         (e.g. via builtin open function) or io.BytesIO. The engine
-        fastparquet does not accept file-like objects.
+        fastparquet does not accept file-like objects. If path is None,
+        a bytes object is returned.
 
-        .. versionchanged:: 0.24.0
+        .. versionchanged:: 1.2.0
 
     engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
         Parquet library to use. If 'auto', then the option
@@ -298,13 +300,20 @@ def to_parquet(
 
     kwargs
         Additional keyword arguments passed to the engine
+
+    Returns
+    -------
+    bytes if no path argument is provided else None
     """
     if isinstance(partition_cols, str):
         partition_cols = [partition_cols]
     impl = get_engine(engine)
-    return impl.write(
+
+    path_or_buf: FilePathOrBuffer = io.BytesIO() if path is None else path
+
+    impl.write(
         df,
-        path,
+        path_or_buf,
         compression=compression,
         index=index,
         partition_cols=partition_cols,
@@ -312,6 +321,12 @@
         **kwargs,
     )
 
+    if path is None:
+        assert isinstance(path_or_buf, io.BytesIO)
+        return path_or_buf.getvalue()
+    else:
+        return None
+
 
 def read_parquet(path, engine: str = "auto", columns=None, **kwargs):
     """
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 8d3d4cc347019..285601b37b80f 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -512,6 +512,17 @@ def test_basic_subset_columns(self, pa, df_full):
             read_kwargs={"columns": ["string", "int"]},
         )
 
+    def test_to_bytes_without_path_or_buf_provided(self, pa, df_full):
+        # GH 37105
+
+        buf_bytes = df_full.to_parquet(engine=pa)
+        assert isinstance(buf_bytes, bytes)
+
+        buf_stream = BytesIO(buf_bytes)
+        res = pd.read_parquet(buf_stream)
+
+        tm.assert_frame_equal(df_full, res)
+
     def test_duplicate_columns(self, pa):
         # not currently able to handle duplicate columns
         df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=list("aaa")).copy()
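
A quick end-to-end sketch of the behavior this patch adds, mirroring the new test. The DataFrame contents here are illustrative, and the snippet assumes a parquet engine such as pyarrow or fastparquet is installed so that engine="auto" can resolve:

    import io

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

    # With no path argument, to_parquet serializes into an in-memory
    # buffer and returns the raw parquet bytes instead of None (GH 37105).
    raw = df.to_parquet()
    assert isinstance(raw, bytes)

    # The returned bytes round-trip through any file-like object.
    result = pd.read_parquet(io.BytesIO(raw))
    pd.testing.assert_frame_equal(df, result)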