ENH: Support MultiIndex columns in parquet (#34777) (#36305)

hweecat · web-flow · commit 5f2bac5dfcf4 · 2020-11-18T21:09:09.000-05:00
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
@@ -238,6 +238,7 @@ Other enhancements
 -
 - Added methods :meth:`IntegerArray.prod`, :meth:`IntegerArray.min`, and :meth:`IntegerArray.max` (:issue:`33790`)
 - Where possible :meth:`RangeIndex.difference` and :meth:`RangeIndex.symmetric_difference` will return :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`36564`)
+- :meth:`DataFrame.to_parquet` now supports :class:`MultiIndex` for columns in parquet format (:issue:`34777`)
 - Added :meth:`Rolling.sem()` and :meth:`Expanding.sem()` to compute the standard error of mean (:issue:`26476`).
 - :meth:`Rolling.var()` and :meth:`Rolling.std()` use Kahan summation and Welfords Method to avoid numerical issues (:issue:`37051`)
 - :meth:`DataFrame.corr` and :meth:`DataFrame.cov` use Welfords Method to avoid numerical issues (:issue:`37448`)
diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
@@ -9,7 +9,7 @@
 from pandas.compat._optional import import_optional_dependency
 from pandas.errors import AbstractMethodError
 
-from pandas import DataFrame, get_option
+from pandas import DataFrame, MultiIndex, get_option
 
 from pandas.io.common import IOHandles, get_handle, is_fsspec_url, stringify_path
 
@@ -89,9 +89,20 @@ def validate_dataframe(df: DataFrame):
         if not isinstance(df, DataFrame):
             raise ValueError("to_parquet only supports IO with DataFrames")
 
-        # must have value column names (strings only)
-        if df.columns.inferred_type not in {"string", "empty"}:
-            raise ValueError("parquet must have string column names")
+        # must have value column names for all index levels (strings only)
+        if isinstance(df.columns, MultiIndex):
+            if not all(
+                x.inferred_type in {"string", "empty"} for x in df.columns.levels
+            ):
+                raise ValueError(
+                    """
+                    parquet must have string column names for all values in
+                     each level of the MultiIndex
+                    """
+                )
+        else:
+            if df.columns.inferred_type not in {"string", "empty"}:
+                raise ValueError("parquet must have string column names")
 
         # index level names must be strings
         valid_names = all(
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
@@ -441,12 +441,6 @@ def test_write_multiindex(self, pa):
         df.index = index
         check_round_trip(df, engine)
 
-    def test_write_column_multiindex(self, engine):
-        # column multi-index
-        mi_columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)])
-        df = pd.DataFrame(np.random.randn(4, 3), columns=mi_columns)
-        self.check_error_on_write(df, engine, ValueError)
-
     def test_multiindex_with_columns(self, pa):
         engine = pa
         dates = pd.date_range("01-Jan-2018", "01-Dec-2018", freq="MS")
@@ -495,6 +489,66 @@ def test_write_ignoring_index(self, engine):
         expected = df.reset_index(drop=True)
         check_round_trip(df, engine, write_kwargs=write_kwargs, expected=expected)
 
+    def test_write_column_multiindex(self, engine):
+        # Not able to write column multi-indexes with non-string column names.
+        mi_columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)])
+        df = pd.DataFrame(np.random.randn(4, 3), columns=mi_columns)
+        self.check_error_on_write(df, engine, ValueError)
+
+    def test_write_column_multiindex_nonstring(self, pa):
+        # GH #34777
+        # Not supported in fastparquet as of 0.1.3
+        engine = pa
+
+        # Not able to write column multi-indexes with non-string column names
+        arrays = [
+            ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
+            [1, 2, 1, 2, 1, 2, 1, 2],
+        ]
+        df = pd.DataFrame(np.random.randn(8, 8), columns=arrays)
+        df.columns.names = ["Level1", "Level2"]
+
+        self.check_error_on_write(df, engine, ValueError)
+
+    def test_write_column_multiindex_string(self, pa):
+        # GH #34777
+        # Not supported in fastparquet as of 0.1.3
+        engine = pa
+
+        # Write column multi-indexes with string column names
+        arrays = [
+            ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
+            ["one", "two", "one", "two", "one", "two", "one", "two"],
+        ]
+        df = pd.DataFrame(np.random.randn(8, 8), columns=arrays)
+        df.columns.names = ["ColLevel1", "ColLevel2"]
+
+        check_round_trip(df, engine)
+
+    def test_write_column_index_string(self, pa):
+        # GH #34777
+        # Not supported in fastparquet as of 0.1.3
+        engine = pa
+
+        # Write column indexes with string column names
+        arrays = ["bar", "baz", "foo", "qux"]
+        df = pd.DataFrame(np.random.randn(8, 4), columns=arrays)
+        df.columns.name = "StringCol"
+
+        check_round_trip(df, engine)
+
+    def test_write_column_index_nonstring(self, pa):
+        # GH #34777
+        # Not supported in fastparquet as of 0.1.3
+        engine = pa
+
+        # Write column indexes with string column names
+        arrays = [1, 2, 3, 4]
+        df = pd.DataFrame(np.random.randn(8, 4), columns=arrays)
+        df.columns.name = "NonStringCol"
+
+        self.check_error_on_write(df, engine, ValueError)
+
 
 class TestParquetPyArrow(Base):
     def test_basic(self, pa, df_full):

Original file line number	Diff line number	Diff line change
`@@ -238,6 +238,7 @@ Other enhancements`
`238`	`238`	`-`
`239`	`239`	- Added methods :meth:`IntegerArray.prod`, :meth:`IntegerArray.min`, and :meth:`IntegerArray.max` (:issue:`33790`)
`240`	`240`	- Where possible :meth:`RangeIndex.difference` and :meth:`RangeIndex.symmetric_difference` will return :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`36564`)
	`241`	+- :meth:`DataFrame.to_parquet` now supports :class:`MultiIndex` for columns in parquet format (:issue:`34777`)
`241`	`242`	- Added :meth:`Rolling.sem()` and :meth:`Expanding.sem()` to compute the standard error of mean (:issue:`26476`).
`242`	`243`	- :meth:`Rolling.var()` and :meth:`Rolling.std()` use Kahan summation and Welfords Method to avoid numerical issues (:issue:`37051`)
`243`	`244`	- :meth:`DataFrame.corr` and :meth:`DataFrame.cov` use Welfords Method to avoid numerical issues (:issue:`37448`)