diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 454098f4ace04..a3b5ba616b258 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -238,6 +238,7 @@ Other enhancements - - Added methods :meth:`IntegerArray.prod`, :meth:`IntegerArray.min`, and :meth:`IntegerArray.max` (:issue:`33790`) - Where possible :meth:`RangeIndex.difference` and :meth:`RangeIndex.symmetric_difference` will return :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`36564`) +- :meth:`DataFrame.to_parquet` now supports :class:`MultiIndex` for columns in parquet format (:issue:`34777`) - Added :meth:`Rolling.sem()` and :meth:`Expanding.sem()` to compute the standard error of mean (:issue:`26476`). - :meth:`Rolling.var()` and :meth:`Rolling.std()` use Kahan summation and Welfords Method to avoid numerical issues (:issue:`37051`) - :meth:`DataFrame.corr` and :meth:`DataFrame.cov` use Welfords Method to avoid numerical issues (:issue:`37448`) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 1f90da2f57579..10c70b9a5c43a 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -9,7 +9,7 @@ from pandas.compat._optional import import_optional_dependency from pandas.errors import AbstractMethodError -from pandas import DataFrame, get_option +from pandas import DataFrame, MultiIndex, get_option from pandas.io.common import IOHandles, get_handle, is_fsspec_url, stringify_path @@ -89,9 +89,20 @@ def validate_dataframe(df: DataFrame): if not isinstance(df, DataFrame): raise ValueError("to_parquet only supports IO with DataFrames") - # must have value column names (strings only) - if df.columns.inferred_type not in {"string", "empty"}: - raise ValueError("parquet must have string column names") + # must have value column names for all index levels (strings only) + if isinstance(df.columns, MultiIndex): + if not all( + x.inferred_type in {"string", "empty"} for x in df.columns.levels + ): + raise ValueError( + """ + 
parquet must have string column names for all values in + each level of the MultiIndex + """ + ) + else: + if df.columns.inferred_type not in {"string", "empty"}: + raise ValueError("parquet must have string column names") # index level names must be strings valid_names = all( diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index e5fffb0e3a3e8..3b83eed69c723 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -441,12 +441,6 @@ def test_write_multiindex(self, pa): df.index = index check_round_trip(df, engine) - def test_write_column_multiindex(self, engine): - # column multi-index - mi_columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)]) - df = pd.DataFrame(np.random.randn(4, 3), columns=mi_columns) - self.check_error_on_write(df, engine, ValueError) - def test_multiindex_with_columns(self, pa): engine = pa dates = pd.date_range("01-Jan-2018", "01-Dec-2018", freq="MS") @@ -495,6 +489,66 @@ def test_write_ignoring_index(self, engine): expected = df.reset_index(drop=True) check_round_trip(df, engine, write_kwargs=write_kwargs, expected=expected) + def test_write_column_multiindex(self, engine): + # Not able to write column multi-indexes with non-string column names. 
+ mi_columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)]) + df = pd.DataFrame(np.random.randn(4, 3), columns=mi_columns) + self.check_error_on_write(df, engine, ValueError) + + def test_write_column_multiindex_nonstring(self, pa): + # GH #34777 + # Not supported in fastparquet as of 0.1.3 + engine = pa + + # Not able to write column multi-indexes with non-string column names + arrays = [ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + [1, 2, 1, 2, 1, 2, 1, 2], + ] + df = pd.DataFrame(np.random.randn(8, 8), columns=arrays) + df.columns.names = ["Level1", "Level2"] + + self.check_error_on_write(df, engine, ValueError) + + def test_write_column_multiindex_string(self, pa): + # GH #34777 + # Not supported in fastparquet as of 0.1.3 + engine = pa + + # Write column multi-indexes with string column names + arrays = [ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] + df = pd.DataFrame(np.random.randn(8, 8), columns=arrays) + df.columns.names = ["ColLevel1", "ColLevel2"] + + check_round_trip(df, engine) + + def test_write_column_index_string(self, pa): + # GH #34777 + # Not supported in fastparquet as of 0.1.3 + engine = pa + + # Write column indexes with string column names + arrays = ["bar", "baz", "foo", "qux"] + df = pd.DataFrame(np.random.randn(8, 4), columns=arrays) + df.columns.name = "StringCol" + + check_round_trip(df, engine) + + def test_write_column_index_nonstring(self, pa): + # GH #34777 + # Not supported in fastparquet as of 0.1.3 + engine = pa + + # Not able to write column indexes with non-string column names + arrays = [1, 2, 3, 4] + df = pd.DataFrame(np.random.randn(8, 4), columns=arrays) + df.columns.name = "NonStringCol" + + self.check_error_on_write(df, engine, ValueError) + class TestParquetPyArrow(Base): def test_basic(self, pa, df_full):