From d5020e17b5d3f2cf699ac075a5f516b312bb5c03 Mon Sep 17 00:00:00 2001 From: Abel Tavares Date: Sat, 13 Apr 2024 01:20:27 +0100 Subject: [PATCH] DOC: Add documentation on parquet categorical data handling --- pandas/core/frame.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b595e4d2158fc..0185ca8241617 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2877,9 +2877,16 @@ def to_parquet( Notes ----- - This function requires either the `fastparquet - <https://pypi.org/project/fastparquet>`_ or `pyarrow - <https://arrow.apache.org/docs/python/>`_ library. + * This function requires either the `fastparquet + <https://pypi.org/project/fastparquet>`_ or `pyarrow + <https://arrow.apache.org/docs/python/>`_ library. + * When saving a DataFrame with categorical columns to parquet, + the file size may increase due to the inclusion of all possible + categories, not just those present in the data. This behavior + is expected and consistent with pandas' handling of categorical data. + To manage file size and ensure a more predictable roundtrip process, + consider using :meth:`Categorical.remove_unused_categories` on the + DataFrame before saving. Examples --------