Skip to content

Commit 5f2bac5

Browse files
authored
ENH: Support MultiIndex columns in parquet (#34777) (#36305)
1 parent 6d5da02 commit 5f2bac5

File tree

3 files changed

+76
-10
lines changed

3 files changed

+76
-10
lines changed

doc/source/whatsnew/v1.2.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,7 @@ Other enhancements
238238
-
239239
- Added methods :meth:`IntegerArray.prod`, :meth:`IntegerArray.min`, and :meth:`IntegerArray.max` (:issue:`33790`)
240240
- Where possible :meth:`RangeIndex.difference` and :meth:`RangeIndex.symmetric_difference` will return :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`36564`)
241+
- :meth:`DataFrame.to_parquet` now supports :class:`MultiIndex` for columns in parquet format (:issue:`34777`)
241242
- Added :meth:`Rolling.sem()` and :meth:`Expanding.sem()` to compute the standard error of mean (:issue:`26476`).
242243
- :meth:`Rolling.var()` and :meth:`Rolling.std()` use Kahan summation and Welford's Method to avoid numerical issues (:issue:`37051`)
243244
- :meth:`DataFrame.corr` and :meth:`DataFrame.cov` use Welford's Method to avoid numerical issues (:issue:`37448`)

pandas/io/parquet.py

+15-4
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from pandas.compat._optional import import_optional_dependency
1010
from pandas.errors import AbstractMethodError
1111

12-
from pandas import DataFrame, get_option
12+
from pandas import DataFrame, MultiIndex, get_option
1313

1414
from pandas.io.common import IOHandles, get_handle, is_fsspec_url, stringify_path
1515

@@ -89,9 +89,20 @@ def validate_dataframe(df: DataFrame):
8989
if not isinstance(df, DataFrame):
9090
raise ValueError("to_parquet only supports IO with DataFrames")
9191

92-
# must have value column names (strings only)
93-
if df.columns.inferred_type not in {"string", "empty"}:
94-
raise ValueError("parquet must have string column names")
92+
# must have valid column names for all index levels (strings only)
93+
if isinstance(df.columns, MultiIndex):
94+
if not all(
95+
x.inferred_type in {"string", "empty"} for x in df.columns.levels
96+
):
97+
raise ValueError(
98+
"""
99+
parquet must have string column names for all values in
100+
each level of the MultiIndex
101+
"""
102+
)
103+
else:
104+
if df.columns.inferred_type not in {"string", "empty"}:
105+
raise ValueError("parquet must have string column names")
95106

96107
# index level names must be strings
97108
valid_names = all(

pandas/tests/io/test_parquet.py

+60-6
Original file line numberDiff line numberDiff line change
@@ -441,12 +441,6 @@ def test_write_multiindex(self, pa):
441441
df.index = index
442442
check_round_trip(df, engine)
443443

444-
def test_write_column_multiindex(self, engine):
445-
# column multi-index
446-
mi_columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)])
447-
df = pd.DataFrame(np.random.randn(4, 3), columns=mi_columns)
448-
self.check_error_on_write(df, engine, ValueError)
449-
450444
def test_multiindex_with_columns(self, pa):
451445
engine = pa
452446
dates = pd.date_range("01-Jan-2018", "01-Dec-2018", freq="MS")
@@ -495,6 +489,66 @@ def test_write_ignoring_index(self, engine):
495489
expected = df.reset_index(drop=True)
496490
check_round_trip(df, engine, write_kwargs=write_kwargs, expected=expected)
497491

492+
def test_write_column_multiindex(self, engine):
493+
# Not able to write column multi-indexes with non-string column names.
494+
mi_columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)])
495+
df = pd.DataFrame(np.random.randn(4, 3), columns=mi_columns)
496+
self.check_error_on_write(df, engine, ValueError)
497+
498+
def test_write_column_multiindex_nonstring(self, pa):
499+
# GH #34777
500+
# Not supported in fastparquet as of 0.1.3
501+
engine = pa
502+
503+
# Not able to write column multi-indexes with non-string column names
504+
arrays = [
505+
["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
506+
[1, 2, 1, 2, 1, 2, 1, 2],
507+
]
508+
df = pd.DataFrame(np.random.randn(8, 8), columns=arrays)
509+
df.columns.names = ["Level1", "Level2"]
510+
511+
self.check_error_on_write(df, engine, ValueError)
512+
513+
def test_write_column_multiindex_string(self, pa):
514+
# GH #34777
515+
# Not supported in fastparquet as of 0.1.3
516+
engine = pa
517+
518+
# Write column multi-indexes with string column names
519+
arrays = [
520+
["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
521+
["one", "two", "one", "two", "one", "two", "one", "two"],
522+
]
523+
df = pd.DataFrame(np.random.randn(8, 8), columns=arrays)
524+
df.columns.names = ["ColLevel1", "ColLevel2"]
525+
526+
check_round_trip(df, engine)
527+
528+
def test_write_column_index_string(self, pa):
529+
# GH #34777
530+
# Not supported in fastparquet as of 0.1.3
531+
engine = pa
532+
533+
# Write column indexes with string column names
534+
arrays = ["bar", "baz", "foo", "qux"]
535+
df = pd.DataFrame(np.random.randn(8, 4), columns=arrays)
536+
df.columns.name = "StringCol"
537+
538+
check_round_trip(df, engine)
539+
540+
def test_write_column_index_nonstring(self, pa):
541+
# GH #34777
542+
# Not supported in fastparquet as of 0.1.3
543+
engine = pa
544+
545+
# Not able to write column indexes with non-string column names
546+
arrays = [1, 2, 3, 4]
547+
df = pd.DataFrame(np.random.randn(8, 4), columns=arrays)
548+
df.columns.name = "NonStringCol"
549+
550+
self.check_error_on_write(df, engine, ValueError)
551+
498552

499553
class TestParquetPyArrow(Base):
500554
def test_basic(self, pa, df_full):

0 commit comments

Comments
 (0)