From e3f1687011d4de8f607906c23556cf1227519e08 Mon Sep 17 00:00:00 2001
From: Judah Rand <17158624+judahrand@users.noreply.github.com>
Date: Tue, 21 Sep 2021 22:20:03 +0100
Subject: [PATCH 1/6] Write compliant Parquet with `pyarrow` if supported

---
 pandas/io/parquet.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index e92afd4e35ca1..0dfb6b6678074 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -13,6 +13,7 @@
     FilePathOrBuffer,
     StorageOptions,
 )
+from pandas.compat import pa_version_under4p0
 from pandas.compat._optional import import_optional_dependency
 from pandas.errors import AbstractMethodError
 from pandas.util._decorators import doc
@@ -179,6 +180,12 @@ def write(
             mode="wb",
             is_dir=partition_cols is not None,
         )
+
+        # Output compliant Parquet if PyArrow supports it and the user hasn't
+        # explicitly set the desired behavior
+        if not pa_version_under4p0 and "use_compliant_nested_type" not in kwargs:
+            kwargs["use_compliant_nested_type"] = True
+
         try:
             if partition_cols is not None:
                 # writes to multiple files under the given path
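Note (illustration, not part of the patch series): the `use_compliant_nested_type` keyword that this patch turns on by default controls how pyarrow names the inner field of a LIST column when writing Parquet. A minimal sketch of the difference it makes, assuming PyArrow >= 4.0.0 and an illustrative file name:

    import pandas as pd
    import pyarrow.parquet as pq

    df = pd.DataFrame({"a": [[1, 2], [3]]})

    # pyarrow's historical default names the inner list field "item".
    df.to_parquet("example.parquet", engine="pyarrow", use_compliant_nested_type=False)
    print(pq.read_schema("example.parquet").field("a").type)  # list<item: int64>

    # The Parquet LogicalTypes spec expects "element"; this is what the patch
    # now requests whenever the caller does not pass the keyword themselves.
    df.to_parquet("example.parquet", engine="pyarrow", use_compliant_nested_type=True)
    print(pq.read_schema("example.parquet").field("a").type)  # list<element: int64>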
From 39010eb97661e2fec782a8a82eff685b48c3ebb1 Mon Sep 17 00:00:00 2001
From: Judah Rand <17158624+judahrand@users.noreply.github.com>
Date: Wed, 22 Sep 2021 09:56:08 +0100
Subject: [PATCH 2/6] Import `pa_version_under5p0` in `compat` submodule

---
 pandas/compat/__init__.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py
index 3233de8e3b6d1..c1fee9ad4121e 100644
--- a/pandas/compat/__init__.py
+++ b/pandas/compat/__init__.py
@@ -24,6 +24,7 @@
     pa_version_under2p0,
     pa_version_under3p0,
     pa_version_under4p0,
+    pa_version_under5p0,
 )

 PY39 = sys.version_info >= (3, 9)
@@ -155,4 +156,5 @@ def get_lzma_file(lzma):
     "pa_version_under2p0",
     "pa_version_under3p0",
     "pa_version_under4p0",
+    "pa_version_under5p0",
 ]

From cc7e6a89603cd675adc09e7bb7db55a3ab36f1c3 Mon Sep 17 00:00:00 2001
From: Judah Rand <17158624+judahrand@users.noreply.github.com>
Date: Wed, 22 Sep 2021 11:26:33 +0100
Subject: [PATCH 3/6] Add whatsnew entry

---
 doc/source/whatsnew/v1.4.0.rst | 40 ++++++++++++++++++++++++++++++++--
 1 file changed, 38 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
index 87f47dc65cea9..982312b278373 100644
--- a/doc/source/whatsnew/v1.4.0.rst
+++ b/doc/source/whatsnew/v1.4.0.rst
@@ -186,9 +186,45 @@ Now the float-dtype is respected. Since the common dtype for these DataFrames is

     res

-.. _whatsnew_140.notable_bug_fixes.notable_bug_fix3:
+.. _whatsnew_140.notable_bug_fixes.write_compliant_parquet_nested_type:

-notable_bug_fix3
+Write compliant Parquet nested types if possible
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+When using :meth:`DataFrame.to_parquet` to write a DataFrame to Parquet, if any of the columns contained arrays
+of values the :mod:`pyarrow` engine would write a non-compliant format. This behavior is now fixed when the installed
+version of PyArrow is at least ``4.0.0``.
+
+https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#nested-types
+
+.. ipython:: python
+
+    import pandas as pd
+    import pyarrow.parquet as pq
+
+    df = pd.DataFrame({"int_array_col": [[1, 2, 3], [4, 5, 6]]})
+    df.to_parquet("/tmp/sample_df")
+    parquet_table = pq.read_table("/tmp/sample_df")
+
+*Previous behavior*:
+
+.. code-block:: ipython
+
+    In [4]: parquet_table.schema.types
+    Out[4]:
+    [ListType(list<item: int64>)]
+
+*New behavior*:
+
+.. code-block:: ipython
+
+    In [4]: parquet_table.schema.types
+    Out[4]:
+    [ListType(list<element: int64>)]
+
+.. _whatsnew_140.notable_bug_fixes.notable_bug_fix4:
+
+notable_bug_fix4
 ^^^^^^^^^^^^^^^^

 .. ---------------------------------------------------------------------------

From 17b4d1794e60ae39127c663e5486b39067f1dc7a Mon Sep 17 00:00:00 2001
From: Judah Rand <17158624+judahrand@users.noreply.github.com>
Date: Wed, 22 Sep 2021 19:10:50 +0100
Subject: [PATCH 4/6] Add test for compliant Parquet

---
 pandas/tests/io/test_parquet.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 01715ee133e96..b9c225cc98542 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -931,6 +931,17 @@ def test_read_parquet_manager(self, pa, using_array_manager):
         else:
             assert isinstance(result._mgr, pd.core.internals.BlockManager)

+    @td.skip_if_no("pyarrow", min_version="4.0.0")
+    def test_list_column_results_in_compliant_parquet(self, pa):
+        # https://github.com/pandas-dev/pandas/issues/43689
+        df = pd.DataFrame({"a": [[1], [2]]})
+
+        with tm.ensure_clean() as path:
+            df.to_parquet(path, pa)
+            result = pyarrow.parquet.read_table(path)
+
+        assert str(result.schema.field_by_name("a").type) == "list<element: int64>"
+

 class TestParquetFastParquet(Base):
     def test_basic(self, fp, df_full):

From 5f06acdc240b3c90c23294364afc1b75aade9a74 Mon Sep 17 00:00:00 2001
From: Judah Rand <17158624+judahrand@users.noreply.github.com>
Date: Wed, 22 Sep 2021 22:57:18 +0100
Subject: [PATCH 5/6] Add `check_round_trip` to test

---
 pandas/tests/io/test_parquet.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index b9c225cc98542..f263ca66b00a0 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -941,6 +941,7 @@ def test_list_column_results_in_compliant_parquet(self, pa):
             result = pyarrow.parquet.read_table(path)

         assert str(result.schema.field_by_name("a").type) == "list<element: int64>"
+        check_round_trip(df, pa)

 class TestParquetFastParquet(Base):
     def test_basic(self, fp, df_full):
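Note (illustration, not part of the patch series): the new test asserts on the Arrow-level type name, but "compliant" ultimately refers to the physical Parquet schema described in the LogicalTypes specification, where a LIST column is a three-level group whose innermost field is named "element". A rough way to inspect that structure directly, reusing the test's DataFrame and an illustrative path, with this series applied:

    import pandas as pd
    import pyarrow.parquet as pq

    df = pd.DataFrame({"a": [[1], [2]]})
    df.to_parquet("compliant.parquet", engine="pyarrow")

    # Prints the Parquet (not Arrow) schema; with the compliant layout the
    # column appears as roughly:
    #   optional group a (List) {
    #     repeated group list {
    #       optional int64 element;
    #     }
    #   }
    print(pq.ParquetFile("compliant.parquet").schema)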
From 4025014cfc642573fad1c8ebf2dbffd065b2455a Mon Sep 17 00:00:00 2001
From: Judah Rand <17158624+judahrand@users.noreply.github.com>
Date: Thu, 30 Sep 2021 21:58:08 +0100
Subject: [PATCH 6/6] Replace long What's New entry with one liner

---
 doc/source/whatsnew/v1.4.0.rst | 41 +++-------------------------------
 1 file changed, 3 insertions(+), 38 deletions(-)

diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
index 982312b278373..a6eaa8f2f57f8 100644
--- a/doc/source/whatsnew/v1.4.0.rst
+++ b/doc/source/whatsnew/v1.4.0.rst
@@ -125,6 +125,7 @@ Other enhancements
 - Attempting to write into a file in missing parent directory with :meth:`DataFrame.to_csv`, :meth:`DataFrame.to_html`, :meth:`DataFrame.to_excel`, :meth:`DataFrame.to_feather`, :meth:`DataFrame.to_parquet`, :meth:`DataFrame.to_stata`, :meth:`DataFrame.to_json`, :meth:`DataFrame.to_pickle`, and :meth:`DataFrame.to_xml` now explicitly mentions missing parent directory, the same is true for :class:`Series` counterparts (:issue:`24306`)
 - :meth:`IntegerArray.all` , :meth:`IntegerArray.any`, :meth:`FloatingArray.any`, and :meth:`FloatingArray.all` use Kleene logic (:issue:`41967`)
 - Added support for nullable boolean and integer types in :meth:`DataFrame.to_stata`, :class:`~pandas.io.stata.StataWriter`, :class:`~pandas.io.stata.StataWriter117`, and :class:`~pandas.io.stata.StataWriterUTF8` (:issue:`40855`)
+- :meth:`DataFrame.to_parquet` now writes Parquet compliant data for columns which contain lists or arrays when using PyArrow 4.0.0 or greater (:issue:`43689`)
 -

 .. ---------------------------------------------------------------------------
@@ -186,45 +187,9 @@ Now the float-dtype is respected. Since the common dtype for these DataFrames is

     res

-.. _whatsnew_140.notable_bug_fixes.write_compliant_parquet_nested_type:
+.. _whatsnew_140.notable_bug_fixes.notable_bug_fix3:

-Write compliant Parquet nested types if possible
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-When using :meth:`DataFrame.to_parquet` to write a DataFrame to Parquet, if any of the columns contained arrays
-of values the :mod:`pyarrow` engine would write a non-compliant format. This behavior is now fixed when the installed
-version of PyArrow is at least ``4.0.0``.
-
-https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#nested-types
-
-.. ipython:: python
-
-    import pandas as pd
-    import pyarrow.parquet as pq
-
-    df = pd.DataFrame({"int_array_col": [[1, 2, 3], [4, 5, 6]]})
-    df.to_parquet("/tmp/sample_df")
-    parquet_table = pq.read_table("/tmp/sample_df")
-
-*Previous behavior*:
-
-.. code-block:: ipython
-
-    In [4]: parquet_table.schema.types
-    Out[4]:
-    [ListType(list<item: int64>)]
-
-*New behavior*:
-
-.. code-block:: ipython
-
-    In [4]: parquet_table.schema.types
-    Out[4]:
-    [ListType(list<element: int64>)]
-
-.. _whatsnew_140.notable_bug_fixes.notable_bug_fix4:
-
-notable_bug_fix4
+notable_bug_fix3
 ^^^^^^^^^^^^^^^^

 .. ---------------------------------------------------------------------------
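Note (illustration, not part of the patch series): because the default is only injected when "use_compliant_nested_type" is absent from the keyword arguments, callers who depend on the old field naming can still opt out by passing the option through :meth:`DataFrame.to_parquet`, whose extra keyword arguments are forwarded to pyarrow. A minimal sketch, with an illustrative file name:

    import pandas as pd

    df = pd.DataFrame({"a": [[1], [2]]})
    # Explicitly request the previous (non-compliant) "item" naming, e.g. for a
    # downstream reader that expects it; pandas will not override this value.
    df.to_parquet("legacy.parquet", engine="pyarrow", use_compliant_nested_type=False)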