From d5020e17b5d3f2cf699ac075a5f516b312bb5c03 Mon Sep 17 00:00:00 2001
From: Abel Tavares <abel.tavares@ctw.bmwgroup.com>
Date: Sat, 13 Apr 2024 01:20:27 +0100
Subject: [PATCH] DOC: Add documentation on parquet categorical data handling

---
 pandas/core/frame.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index b595e4d2158fc..0185ca8241617 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -2877,9 +2877,16 @@ def to_parquet(
 
         Notes
         -----
-        This function requires either the `fastparquet
-        <https://pypi.org/project/fastparquet>`_ or `pyarrow
-        <https://arrow.apache.org/docs/python/>`_ library.
+        * This function requires either the `fastparquet
+          <https://pypi.org/project/fastparquet>`_ or `pyarrow
+          <https://arrow.apache.org/docs/python/>`_ library.
+        * When saving a DataFrame with categorical columns to parquet,
+          the file size may increase due to the inclusion of all possible
+          categories, not just those present in the data. This behavior
+          is expected and consistent with pandas' handling of categorical data.
+          To manage file size and ensure a more predictable roundtrip process,
+          consider using :meth:`Categorical.remove_unused_categories` on the
+          categorical columns before saving.
 
         Examples
         --------