Skip to content

Commit 152595c

Browse files
Parquet metadata persistence of DataFrame.attrs (#54346)
* added df.attrs metadata to pyarrow table for persistence * hooks * placed unit test in correct class * update unit test * changed to consistent use of json * added whatsnew * added guard to check if df.attrs exists * updated whatsnew
1 parent eaddc1d commit 152595c

File tree

3 files changed

+21
-1
lines changed

3 files changed

+21
-1
lines changed

doc/source/whatsnew/v2.1.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -212,8 +212,8 @@ Other enhancements
212212
- Improved error message when :meth:`DataFrameGroupBy.agg` failed (:issue:`52930`)
213213
- Many read/to_* functions, such as :meth:`DataFrame.to_pickle` and :func:`read_csv`, support forwarding compression arguments to lzma.LZMAFile (:issue:`52979`)
214214
- Reductions :meth:`Series.argmax`, :meth:`Series.argmin`, :meth:`Series.idxmax`, :meth:`Series.idxmin`, :meth:`Index.argmax`, :meth:`Index.argmin`, :meth:`DataFrame.idxmax`, :meth:`DataFrame.idxmin` are now supported for object-dtype objects (:issue:`4279`, :issue:`18021`, :issue:`40685`, :issue:`43697`)
215+
- :meth:`DataFrame.to_parquet` and :func:`read_parquet` will now write and read ``attrs`` respectively (:issue:`54346`)
215216
- Performance improvement in :meth:`GroupBy.quantile` (:issue:`51722`)
216-
-
217217

218218
.. ---------------------------------------------------------------------------
219219
.. _whatsnew_210.notable_bug_fixes:

pandas/io/parquet.py

+12
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from __future__ import annotations
33

44
import io
5+
import json
56
import os
67
from typing import (
78
TYPE_CHECKING,
@@ -184,6 +185,12 @@ def write(
184185

185186
table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
186187

188+
if df.attrs:
189+
df_metadata = {"PANDAS_ATTRS": json.dumps(df.attrs)}
190+
existing_metadata = table.schema.metadata
191+
merged_metadata = {**existing_metadata, **df_metadata}
192+
table = table.replace_schema_metadata(merged_metadata)
193+
187194
path_or_handle, handles, filesystem = _get_path_or_handle(
188195
path,
189196
filesystem,
@@ -268,6 +275,11 @@ def read(
268275

269276
if manager == "array":
270277
result = result._as_manager("array", copy=False)
278+
279+
if pa_table.schema.metadata:
280+
if b"PANDAS_ATTRS" in pa_table.schema.metadata:
281+
df_metadata = pa_table.schema.metadata[b"PANDAS_ATTRS"]
282+
result.attrs = json.loads(df_metadata)
271283
return result
272284
finally:
273285
if handles is not None:

pandas/tests/io/test_parquet.py

+8
Original file line numberDiff line numberDiff line change
@@ -1098,6 +1098,14 @@ def test_empty_columns(self, pa):
10981098
df = pd.DataFrame(index=pd.Index(["a", "b", "c"], name="custom name"))
10991099
check_round_trip(df, pa)
11001100

1101+
def test_df_attrs_persistence(self, tmp_path, pa):
    """Round-trip ``DataFrame.attrs`` through parquet (GH 54346).

    Writes a one-cell frame carrying custom ``attrs`` metadata with the
    pyarrow engine, reads it back, and checks the attrs survived.
    """
    target = tmp_path / "test_df_metadata.p"
    original = pd.DataFrame(data={1: [1]})
    original.attrs = {"test_attribute": 1}
    original.to_parquet(target, engine=pa)
    # Reload from disk; attrs must match what was written.
    restored = read_parquet(target, engine=pa)
    assert restored.attrs == original.attrs
1108+
11011109

11021110
class TestParquetFastParquet(Base):
11031111
def test_basic(self, fp, df_full):

0 commit comments

Comments
 (0)