From 16b75bddf1d0050f2b53831609aef1d8f463f7ca Mon Sep 17 00:00:00 2001 From: Sanjith Chockan Date: Tue, 1 Aug 2023 04:25:06 +0000 Subject: [PATCH 1/8] added df.attrs metadata to pyarrow table for persistence --- pandas/io/parquet.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 61112542fb9d8..4f352d621c0e7 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -1,7 +1,9 @@ """ parquet compat """ from __future__ import annotations +import ast import io +import json import os from typing import ( TYPE_CHECKING, @@ -184,6 +186,11 @@ def write( table = self.api.Table.from_pandas(df, **from_pandas_kwargs) + df_metadata = {"df.attrs": json.dumps(df.attrs)} + existing_metadata = table.schema.metadata + merged_metadata = {**existing_metadata, **df_metadata} + table = table.replace_schema_metadata(merged_metadata) + path_or_handle, handles, filesystem = _get_path_or_handle( path, filesystem, @@ -263,6 +270,10 @@ def read( if manager == "array": result = result._as_manager("array", copy=False) + + result.attrs = ast.literal_eval( + pa_table.schema.metadata[b"df.attrs"].decode("utf-8") + ) return result finally: if handles is not None: From 9e92d135be8df39f59af08ffd8dfee316dde0681 Mon Sep 17 00:00:00 2001 From: Sanjith Chockan Date: Tue, 1 Aug 2023 04:47:27 +0000 Subject: [PATCH 2/8] hooks --- pandas/io/parquet.py | 7 ++++--- pandas/tests/io/test_parquet.py | 8 ++++++++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 4f352d621c0e7..ef35be41e007c 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -271,9 +271,10 @@ def read( if manager == "array": result = result._as_manager("array", copy=False) - result.attrs = ast.literal_eval( - pa_table.schema.metadata[b"df.attrs"].decode("utf-8") - ) + if pa_table.schema.metadata: + if b"df.attrs" in pa_table.schema.metadata: + df_metadata = pa_table.schema.metadata[b"df.attrs"] + 
result.attrs = ast.literal_eval(df_metadata.decode("utf-8")) return result finally: if handles is not None: diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 0d8afbf220b0c..4dc96520b2dbc 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1192,6 +1192,14 @@ def test_partition_on_supported(self, tmp_path, fp, df_full): actual_partition_cols = fastparquet.ParquetFile(str(tmp_path), False).cats assert len(actual_partition_cols) == 2 + def test_df_attrs_persistence(self, tmp_path): + path = tmp_path / "test_df_metadata.p" + df = pd.DataFrame(data={1: [1]}) + df.attrs = {"Test attribute": 1} + df.to_parquet(path) + new_df = read_parquet(path) + assert new_df.attrs == df.attrs + def test_error_on_using_partition_cols_and_partition_on( self, tmp_path, fp, df_full ): From f974af1617b113f7479ed84668b5990714f5f95f Mon Sep 17 00:00:00 2001 From: Sanjith Chockan Date: Wed, 2 Aug 2023 19:52:25 +0000 Subject: [PATCH 3/8] placed unit test in correct class --- pandas/tests/io/test_parquet.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 4dc96520b2dbc..7bdd2e3c0daf3 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1065,6 +1065,14 @@ def test_empty_columns(self, pa): df = pd.DataFrame(index=pd.Index(["a", "b", "c"], name="custom name")) check_round_trip(df, pa) + def test_df_attrs_persistence(self, tmp_path, pa): + path = tmp_path / "test_df_metadata.p" + df = pd.DataFrame(data={1: [1]}) + df.attrs = {"Test attribute": 1} + df.to_parquet(path, engine=pa) + new_df = read_parquet(path, engine=pa) + assert new_df.attrs == df.attrs + class TestParquetFastParquet(Base): def test_basic(self, fp, df_full): @@ -1192,14 +1200,6 @@ def test_partition_on_supported(self, tmp_path, fp, df_full): actual_partition_cols = fastparquet.ParquetFile(str(tmp_path), False).cats assert 
len(actual_partition_cols) == 2 - def test_df_attrs_persistence(self, tmp_path): - path = tmp_path / "test_df_metadata.p" - df = pd.DataFrame(data={1: [1]}) - df.attrs = {"Test attribute": 1} - df.to_parquet(path) - new_df = read_parquet(path) - assert new_df.attrs == df.attrs - def test_error_on_using_partition_cols_and_partition_on( self, tmp_path, fp, df_full ): From b559ca8b00ab25d476006033c5dd1aeca5a0a50b Mon Sep 17 00:00:00 2001 From: Sanjith Chockan Date: Wed, 2 Aug 2023 23:23:47 +0000 Subject: [PATCH 4/8] update unit test --- pandas/tests/io/test_parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 7bdd2e3c0daf3..564fa3447e7ab 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1068,7 +1068,7 @@ def test_empty_columns(self, pa): def test_df_attrs_persistence(self, tmp_path, pa): path = tmp_path / "test_df_metadata.p" df = pd.DataFrame(data={1: [1]}) - df.attrs = {"Test attribute": 1} + df.attrs = {"test_attribute": 1} df.to_parquet(path, engine=pa) new_df = read_parquet(path, engine=pa) assert new_df.attrs == df.attrs From e6a34e65160a4e3ea258fe0f48856e088af231dd Mon Sep 17 00:00:00 2001 From: Sanjith Chockan Date: Thu, 3 Aug 2023 03:39:53 +0000 Subject: [PATCH 5/8] changed to consistent use of json --- pandas/io/parquet.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index ef35be41e007c..0abba428c9c87 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -1,7 +1,6 @@ """ parquet compat """ from __future__ import annotations -import ast import io import json import os @@ -186,7 +185,7 @@ def write( table = self.api.Table.from_pandas(df, **from_pandas_kwargs) - df_metadata = {"df.attrs": json.dumps(df.attrs)} + df_metadata = {"PANDAS_ATTRS": json.dumps(df.attrs)} existing_metadata = table.schema.metadata merged_metadata = {**existing_metadata, 
**df_metadata} table = table.replace_schema_metadata(merged_metadata) @@ -272,9 +271,9 @@ def read( result = result._as_manager("array", copy=False) if pa_table.schema.metadata: - if b"df.attrs" in pa_table.schema.metadata: - df_metadata = pa_table.schema.metadata[b"df.attrs"] - result.attrs = ast.literal_eval(df_metadata.decode("utf-8")) + if b"PANDAS_ATTRS" in pa_table.schema.metadata: + df_metadata = pa_table.schema.metadata[b"PANDAS_ATTRS"] + result.attrs = json.loads(df_metadata) return result finally: if handles is not None: From e3f2f2a3cc12b7b120c1c54812453d58a01e9bf9 Mon Sep 17 00:00:00 2001 From: Sanjith Chockan Date: Thu, 3 Aug 2023 03:40:16 +0000 Subject: [PATCH 6/8] added whatsnew --- doc/source/whatsnew/v2.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 98465b5686ca1..ab88b968eac5b 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -176,8 +176,8 @@ Other enhancements - Performance improvement in :func:`concat` with homogeneous ``np.float64`` or ``np.float32`` dtypes (:issue:`52685`) - Performance improvement in :meth:`DataFrame.filter` when ``items`` is given (:issue:`52941`) - Reductions :meth:`Series.argmax`, :meth:`Series.argmin`, :meth:`Series.idxmax`, :meth:`Series.idxmin`, :meth:`Index.argmax`, :meth:`Index.argmin`, :meth:`DataFrame.idxmax`, :meth:`DataFrame.idxmin` are now supported for object-dtype objects (:issue:`4279`, :issue:`18021`, :issue:`40685`, :issue:`43697`) +- Added ``PANDAS_ATTRS`` to :attr:`Schema.metadata` in :class:`PyArrowImpl` for parquet metadata persistence using pyarrow engine (:issue:`54346`) - Performance improvement in :meth:`GroupBy.quantile` (:issue:`51722`) -- .. --------------------------------------------------------------------------- .. 
_whatsnew_210.notable_bug_fixes: From 66e3992406171943655e84f15a18c79efb648edb Mon Sep 17 00:00:00 2001 From: Sanjith Chockan Date: Thu, 3 Aug 2023 03:43:42 +0000 Subject: [PATCH 7/8] added guard to check if df.attrs exists --- pandas/io/parquet.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 0abba428c9c87..ef8ed915b0ecc 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -185,10 +185,11 @@ def write( table = self.api.Table.from_pandas(df, **from_pandas_kwargs) - df_metadata = {"PANDAS_ATTRS": json.dumps(df.attrs)} - existing_metadata = table.schema.metadata - merged_metadata = {**existing_metadata, **df_metadata} - table = table.replace_schema_metadata(merged_metadata) + if df.attrs: + df_metadata = {"PANDAS_ATTRS": json.dumps(df.attrs)} + existing_metadata = table.schema.metadata + merged_metadata = {**existing_metadata, **df_metadata} + table = table.replace_schema_metadata(merged_metadata) path_or_handle, handles, filesystem = _get_path_or_handle( path, From 6a96e363cf84d5edeac16ceee824f191c81a3360 Mon Sep 17 00:00:00 2001 From: Sanjith Chockan Date: Thu, 3 Aug 2023 17:42:32 +0000 Subject: [PATCH 8/8] updated whatsnew --- doc/source/whatsnew/v2.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index ab88b968eac5b..22a9366ae0752 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -176,7 +176,7 @@ Other enhancements - Performance improvement in :func:`concat` with homogeneous ``np.float64`` or ``np.float32`` dtypes (:issue:`52685`) - Performance improvement in :meth:`DataFrame.filter` when ``items`` is given (:issue:`52941`) - Reductions :meth:`Series.argmax`, :meth:`Series.argmin`, :meth:`Series.idxmax`, :meth:`Series.idxmin`, :meth:`Index.argmax`, :meth:`Index.argmin`, :meth:`DataFrame.idxmax`, :meth:`DataFrame.idxmin` are now supported for object-dtype 
objects (:issue:`4279`, :issue:`18021`, :issue:`40685`, :issue:`43697`) -- Added ``PANDAS_ATTRS`` to :attr:`Schema.metadata` in :class:`PyArrowImpl` for parquet metadata persistence using pyarrow engine (:issue:`54346`) +- :meth:`DataFrame.to_parquet` and :func:`read_parquet` will now write and read ``attrs`` respectively (:issue:`54346`) - Performance improvement in :meth:`GroupBy.quantile` (:issue:`51722`) .. ---------------------------------------------------------------------------