From 681ac1ff0c9b0dd7bd63a4233785130a30eb8ca2 Mon Sep 17 00:00:00 2001 From: Ong Chin Hwee Date: Sat, 12 Sep 2020 20:29:23 +0800 Subject: [PATCH 1/9] ENH: Support MultiIndex columns in parquet (#34777) 1. Update check to handle MultiIndex columns for parquet format --- pandas/io/parquet.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 07f2078931687..dbb3a86adbc5c 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -53,9 +53,15 @@ def validate_dataframe(df: DataFrame): if not isinstance(df, DataFrame): raise ValueError("to_parquet only supports IO with DataFrames") - # must have value column names (strings only) - if df.columns.inferred_type not in {"string", "empty"}: - raise ValueError("parquet must have string column names") + # must have value column names for all index levels (strings only) + if df.columns.nlevels > 1: + if not all( + x.inferred_type in {"string", "empty"} for x in df.columns.levels + ): + raise ValueError("parquet must have string column names") + else: + if df.columns.inferred_type not in {"string", "empty"}: + raise ValueError("parquet must have string column names") # index level names must be strings valid_names = all( From a46e46ffe4d6f404b55553c6436d300f1bff0819 Mon Sep 17 00:00:00 2001 From: Ong Chin Hwee Date: Sat, 12 Sep 2020 20:52:36 +0800 Subject: [PATCH 2/9] ENH: Support MultiIndex columns in parquet GH34777 1. add whatsnew entry --- doc/source/whatsnew/v1.2.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index ba556c8dcca54..85bedfa337753 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -297,6 +297,7 @@ I/O - :meth:`to_csv` did not support zip compression for binary file object not having a filename (:issue: `35058`) - :meth:`to_csv` and :meth:`read_csv` did not honor `compression` and `encoding` for path-like objects that are internally converted to file-like objects (:issue:`35677`, :issue:`26124`, and :issue:`32392`) - :meth:`to_picke` and :meth:`read_pickle` did not support compression for file-objects (:issue:`26237`, :issue:`29054`, and :issue:`29570`) +- :meth:`to_parquet` did not support MultiIndex for columns in parquet format (:issue:`34777`) Plotting ^^^^^^^^ From e9ff7797d429f062d315ac86141a04083b1a89f7 Mon Sep 17 00:00:00 2001 From: Ong Chin Hwee Date: Sun, 13 Sep 2020 21:27:39 +0800 Subject: [PATCH 3/9] ENH: Support MultiIndex columns in parquet #34777 1. Update check to handle MultiIndex columns for parquet format 2. Edit whatsnew entry. 3. Add test for writing MultiIndex columns with string column names --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/io/parquet.py | 4 ++-- pandas/tests/io/test_parquet.py | 15 ++++++++++++++- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index e957382969a83..a382df7570a69 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -297,7 +297,7 @@ I/O - :meth:`to_csv` did not support zip compression for binary file object not having a filename (:issue: `35058`) - :meth:`to_csv` and :meth:`read_csv` did not honor `compression` and `encoding` for path-like objects that are internally converted to file-like objects (:issue:`35677`, :issue:`26124`, and :issue:`32392`) - :meth:`to_picke` and :meth:`read_pickle` did not support compression for file-objects (:issue:`26237`, :issue:`29054`, and :issue:`29570`) -- :meth:`to_parquet` did not support MultiIndex for columns in parquet format (:issue:`34777`) +- :meth:`to_parquet` did not support :class:`MultiIndex` for columns in parquet format (:issue:`34777`) Plotting ^^^^^^^^ diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index dbb3a86adbc5c..d54b9bd7f3147 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -7,7 +7,7 @@ from pandas.compat._optional import import_optional_dependency from pandas.errors import AbstractMethodError -from pandas import DataFrame, get_option +from pandas import DataFrame, MultiIndex, get_option from pandas.io.common import get_filepath_or_buffer, is_fsspec_url, stringify_path @@ -54,7 +54,7 @@ def validate_dataframe(df: DataFrame): raise ValueError("to_parquet only supports IO with DataFrames") # must have value column names for all index levels (strings only) - if df.columns.nlevels > 1: + if isinstance(df.columns, MultiIndex): if not all( x.inferred_type in {"string", "empty"} for x in df.columns.levels ): diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 35a400cba8671..7dc0a4f938faa 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -410,11 +410,24 @@ def test_write_multiindex(self, pa): check_round_trip(df, engine) def test_write_column_multiindex(self, engine): - # column multi-index + # Not able to write column multi-indexes with non-string column names. mi_columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)]) df = pd.DataFrame(np.random.randn(4, 3), columns=mi_columns) self.check_error_on_write(df, engine, ValueError) + def test_write_column_multiindex_string(self, pa): + # Not supported in fastparquet as of 0.1.3 or older pyarrow version + engine = pa + + # Write column multi-indexes with string column names + arrays = [ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] + df = pd.DataFrame(np.random.randn(8, 8), columns=arrays) + + check_round_trip(df, engine) + def test_multiindex_with_columns(self, pa): engine = pa dates = pd.date_range("01-Jan-2018", "01-Dec-2018", freq="MS") From 1b9e3f035099436e6e6f1e032158113ecbf1d833 Mon Sep 17 00:00:00 2001 From: Ong Chin Hwee Date: Sun, 13 Sep 2020 22:09:56 +0800 Subject: [PATCH 4/9] ENH: Support MultiIndex columns in parquet #34777 1. Include issue number as a comment on added test --- pandas/tests/io/test_parquet.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 7dc0a4f938faa..6a41eda2d8cc9 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -416,6 +416,7 @@ def test_write_column_multiindex(self, engine): self.check_error_on_write(df, engine, ValueError) def test_write_column_multiindex_string(self, pa): + # GH #34777 # Not supported in fastparquet as of 0.1.3 or older pyarrow version engine = pa From 9e8f4ebd1cba08abd01440e62eb2cd1ac72177f4 Mon Sep 17 00:00:00 2001 From: chinhwee Date: Mon, 14 Sep 2020 09:19:37 +0000 Subject: [PATCH 5/9] ENH: Support MultiIndex columns in parquet #34777 1. Add tests for writing Indexes and MultiIndexes for columns 2. Edit message for check to handle MultiIndex columns for parquet 3. Edit whatsnew entry to move entry to other enhancements --- doc/source/whatsnew/v1.2.0.rst | 4 +- pandas/io/parquet.py | 4 +- pandas/tests/io/test_parquet.py | 80 ++++++++++++++++++++++++--------- 3 files changed, 65 insertions(+), 23 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index a382df7570a69..091dc61466f4b 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -297,7 +297,6 @@ I/O - :meth:`to_csv` did not support zip compression for binary file object not having a filename (:issue: `35058`) - :meth:`to_csv` and :meth:`read_csv` did not honor `compression` and `encoding` for path-like objects that are internally converted to file-like objects (:issue:`35677`, :issue:`26124`, and :issue:`32392`) - :meth:`to_picke` and :meth:`read_pickle` did not support compression for file-objects (:issue:`26237`, :issue:`29054`, and :issue:`29570`) -- :meth:`to_parquet` did not support :class:`MultiIndex` for columns in parquet format (:issue:`34777`) Plotting ^^^^^^^^ @@ -343,7 +342,8 @@ Other ^^^^^ - Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` incorrectly raising ``AssertionError`` instead of ``ValueError`` when invalid parameter combinations are passed (:issue:`36045`) - Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` with numeric values and string ``to_replace`` (:issue:`34789`) -- +- :meth:`to_parquet` support when using :class:`MultiIndex` for columns in parquet format enabled with pyarrow >= 0.15 (:issue:`34777`) + .. --------------------------------------------------------------------------- diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index d54b9bd7f3147..c160c0c59f5e8 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -58,7 +58,9 @@ def validate_dataframe(df: DataFrame): if not all( x.inferred_type in {"string", "empty"} for x in df.columns.levels ): - raise ValueError("parquet must have string column names") + raise ValueError( + "parquet must have string column names for all values in each level of the MultiIndex" + ) else: if df.columns.inferred_type not in {"string", "empty"}: raise ValueError("parquet must have string column names") diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 6a41eda2d8cc9..577e07fbdc09c 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -409,26 +409,6 @@ def test_write_multiindex(self, pa): df.index = index check_round_trip(df, engine) - def test_write_column_multiindex(self, engine): - # Not able to write column multi-indexes with non-string column names. - mi_columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)]) - df = pd.DataFrame(np.random.randn(4, 3), columns=mi_columns) - self.check_error_on_write(df, engine, ValueError) - - def test_write_column_multiindex_string(self, pa): - # GH #34777 - # Not supported in fastparquet as of 0.1.3 or older pyarrow version - engine = pa - - # Write column multi-indexes with string column names - arrays = [ - ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], - ["one", "two", "one", "two", "one", "two", "one", "two"], - ] - df = pd.DataFrame(np.random.randn(8, 8), columns=arrays) - - check_round_trip(df, engine) - def test_multiindex_with_columns(self, pa): engine = pa dates = pd.date_range("01-Jan-2018", "01-Dec-2018", freq="MS") @@ -477,6 +457,66 @@ def test_write_ignoring_index(self, engine): expected = df.reset_index(drop=True) check_round_trip(df, engine, write_kwargs=write_kwargs, expected=expected) + def test_write_column_multiindex(self, engine): + # Not able to write column multi-indexes with non-string column names. + mi_columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)]) + df = pd.DataFrame(np.random.randn(4, 3), columns=mi_columns) + self.check_error_on_write(df, engine, ValueError) + + def test_write_column_multiindex_nonstring(self, pa): + # GH #34777 + # Not supported in fastparquet as of 0.1.3 + engine = pa + + # Not able to write column multi-indexes with non-string column names + arrays = [ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + [1, 2, 1, 2, 1, 2, 1, 2], + ] + df = pd.DataFrame(np.random.randn(8, 8), columns=arrays) + df.columns.names = ["Level1", "Level2"] + + self.check_error_on_write(df, engine, ValueError) + + def test_write_column_multiindex_string(self, pa): + # GH #34777 + # Not supported in fastparquet as of 0.1.3 + engine = pa + + # Write column multi-indexes with string column names + arrays = [ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] + df = pd.DataFrame(np.random.randn(8, 8), columns=arrays) + df.columns.names = ["ColLevel1", "ColLevel2"] + + check_round_trip(df, engine) + + def test_write_column_index_string(self, pa): + # GH #34777 + # Not supported in fastparquet as of 0.1.3 + engine = pa + + # Write column indexes with string column names + arrays = ["bar", "baz", "foo", "qux"] + df = pd.DataFrame(np.random.randn(8, 4), columns=arrays) + df.columns.name = "StringCol" + + check_round_trip(df, engine) + + def test_write_column_index_nonstring(self, pa): + # GH #34777 + # Not supported in fastparquet as of 0.1.3 + engine = pa + + # Write column indexes with string column names + arrays = [1, 2, 3, 4] + df = pd.DataFrame(np.random.randn(8, 4), columns=arrays) + df.columns.name = "NonStringCol" + + self.check_error_on_write(df, engine, ValueError) + class TestParquetPyArrow(Base): def test_basic(self, pa, df_full): From cc0f504da0c02964efa8f4df8ddb2bcbddf7f159 Mon Sep 17 00:00:00 2001 From: chinhwee Date: Mon, 14 Sep 2020 09:23:46 +0000 Subject: [PATCH 6/9] ENH: Support MultiIndex columns in parquet #34777 1. Fix PEP8 issue for error message in check for MultiIndex columns --- pandas/io/parquet.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index c160c0c59f5e8..e3bd13b3e2965 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -59,7 +59,10 @@ def validate_dataframe(df: DataFrame): x.inferred_type in {"string", "empty"} for x in df.columns.levels ): raise ValueError( - "parquet must have string column names for all values in each level of the MultiIndex" + """ + parquet must have string column names for all values in + each level of the MultiIndex + """ ) else: if df.columns.inferred_type not in {"string", "empty"}: From ed5fe60bed2fb4b8672196f759800ff947b3f595 Mon Sep 17 00:00:00 2001 From: Ong Chin Hwee Date: Sun, 11 Oct 2020 11:24:44 +0800 Subject: [PATCH 7/9] ENH: Support MultiIndex columns in parquet #34777 add whatsnew entry: enhancements in 1.2 --- doc/source/whatsnew/v1.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 2b4b10c39602a..73f81457d01a3 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -191,6 +191,7 @@ Other enhancements - :meth:`DatetimeIndex.searchsorted`, :meth:`TimedeltaIndex.searchsorted`, :meth:`PeriodIndex.searchsorted`, and :meth:`Series.searchsorted` with datetimelike dtypes will now try to cast string arguments (listlike and scalar) to the matching datetimelike type (:issue:`36346`) - Added methods :meth:`IntegerArray.prod`, :meth:`IntegerArray.min`, and :meth:`IntegerArray.max` (:issue:`33790`) - Where possible :meth:`RangeIndex.difference` and :meth:`RangeIndex.symmetric_difference` will return :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`36564`) +- :meth:`to_parquet` now supports :class:`MultiIndex` for columns in parquet format (:issue:`34777`) .. _whatsnew_120.api_breaking.python: @@ -484,7 +485,6 @@ Other - Fixed metadata propagation in the :class:`Series.dt` and :class:`Series.str` accessors (:issue:`28283`) - Bug in :meth:`Index.union` behaving differently depending on whether operand is a :class:`Index` or other list-like (:issue:`36384`) - Passing an array with 2 or more dimensions to the :class:`Series` constructor now raises the more specific ``ValueError``, from a bare ``Exception`` previously (:issue:`35744`) - .. --------------------------------------------------------------------------- .. _whatsnew_120.contributors: From c859a4ff5bb73fd2e8afdf440e2d00f55677d83e Mon Sep 17 00:00:00 2001 From: chinhwee Date: Sun, 11 Oct 2020 04:16:40 +0000 Subject: [PATCH 8/9] fix doc failure --- doc/source/whatsnew/v1.2.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 73f81457d01a3..2d7387f253fa7 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -485,6 +485,7 @@ Other - Fixed metadata propagation in the :class:`Series.dt` and :class:`Series.str` accessors (:issue:`28283`) - Bug in :meth:`Index.union` behaving differently depending on whether operand is a :class:`Index` or other list-like (:issue:`36384`) - Passing an array with 2 or more dimensions to the :class:`Series` constructor now raises the more specific ``ValueError``, from a bare ``Exception`` previously (:issue:`35744`) + .. --------------------------------------------------------------------------- .. _whatsnew_120.contributors: From 180ddff9e0d71ff5c3cbd21491fe95d21f844726 Mon Sep 17 00:00:00 2001 From: Kaiqi Dong Date: Wed, 18 Nov 2020 16:37:18 +0100 Subject: [PATCH 9/9] Update doc/source/whatsnew/v1.2.0.rst --- doc/source/whatsnew/v1.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 55cdc54daf834..36a7856c45678 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -221,7 +221,7 @@ Other enhancements - - Added methods :meth:`IntegerArray.prod`, :meth:`IntegerArray.min`, and :meth:`IntegerArray.max` (:issue:`33790`) - Where possible :meth:`RangeIndex.difference` and :meth:`RangeIndex.symmetric_difference` will return :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`36564`) -- :meth:`to_parquet` now supports :class:`MultiIndex` for columns in parquet format (:issue:`34777`) +- :meth:`DataFrame.to_parquet` now supports :class:`MultiIndex` for columns in parquet format (:issue:`34777`) - Added :meth:`Rolling.sem()` and :meth:`Expanding.sem()` to compute the standard error of mean (:issue:`26476`). - :meth:`Rolling.var()` and :meth:`Rolling.std()` use Kahan summation and Welfords Method to avoid numerical issues (:issue:`37051`) - :meth:`DataFrame.corr` and :meth:`DataFrame.cov` use Welfords Method to avoid numerical issues (:issue:`37448`)