diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 87f47dc65cea9..a6eaa8f2f57f8 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -125,6 +125,7 @@ Other enhancements - Attempting to write into a file in missing parent directory with :meth:`DataFrame.to_csv`, :meth:`DataFrame.to_html`, :meth:`DataFrame.to_excel`, :meth:`DataFrame.to_feather`, :meth:`DataFrame.to_parquet`, :meth:`DataFrame.to_stata`, :meth:`DataFrame.to_json`, :meth:`DataFrame.to_pickle`, and :meth:`DataFrame.to_xml` now explicitly mentions missing parent directory, the same is true for :class:`Series` counterparts (:issue:`24306`) - :meth:`IntegerArray.all` , :meth:`IntegerArray.any`, :meth:`FloatingArray.any`, and :meth:`FloatingArray.all` use Kleene logic (:issue:`41967`) - Added support for nullable boolean and integer types in :meth:`DataFrame.to_stata`, :class:`~pandas.io.stata.StataWriter`, :class:`~pandas.io.stata.StataWriter117`, and :class:`~pandas.io.stata.StataWriterUTF8` (:issue:`40855`) +- :meth:`DataFrame.to_parquet` now writes Parquet compliant data for columns which contain lists or arrays when using PyArrow 4.0.0 or greater (:issue:`43689`) - .. 
--------------------------------------------------------------------------- diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 3233de8e3b6d1..c1fee9ad4121e 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -24,6 +24,7 @@ pa_version_under2p0, pa_version_under3p0, pa_version_under4p0, + pa_version_under5p0, ) PY39 = sys.version_info >= (3, 9) @@ -155,4 +156,5 @@ def get_lzma_file(lzma): "pa_version_under2p0", "pa_version_under3p0", "pa_version_under4p0", + "pa_version_under5p0", ] diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index e92afd4e35ca1..0dfb6b6678074 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -13,6 +13,7 @@ FilePathOrBuffer, StorageOptions, ) +from pandas.compat import pa_version_under4p0 from pandas.compat._optional import import_optional_dependency from pandas.errors import AbstractMethodError from pandas.util._decorators import doc @@ -179,6 +180,12 @@ def write( mode="wb", is_dir=partition_cols is not None, ) + + # Output compliant Parquet if PyArrow supports it and the user hasn't + # explicitly set the desired behavior + if not pa_version_under4p0 and "use_compliant_nested_type" not in kwargs: + kwargs["use_compliant_nested_type"] = True + try: if partition_cols is not None: # writes to multiple files under the given path diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 01715ee133e96..f263ca66b00a0 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -931,6 +931,18 @@ def test_read_parquet_manager(self, pa, using_array_manager): else: assert isinstance(result._mgr, pd.core.internals.BlockManager) + @td.skip_if_no("pyarrow", min_version="4.0.0") + def test_list_column_results_in_compliant_parquet(self, pa): + # https://github.com/pandas-dev/pandas/issues/43689 + df = pd.DataFrame({"a": [[1], [2]]}) + + with tm.ensure_clean() as path: + df.to_parquet(path, pa) + result = 
pyarrow.parquet.read_table(path) + + assert str(result.schema.field_by_name("a").type) == "list<element: int64>" + check_round_trip(df, pa) + class TestParquetFastParquet(Base): def test_basic(self, fp, df_full):