Skip to content

ENH: Support MultiIndex columns in parquet (#34777) #36305

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 20 commits into from
Nov 19, 2020
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
681ac1f
ENH: Support MultiIndex columns in parquet (#34777)
hweecat Sep 12, 2020
a46e46f
ENH: Support MultiIndex columns in parquet GH34777
hweecat Sep 12, 2020
c974259
Merge remote-tracking branch 'upstream/master' into io-parquet-multii…
hweecat Sep 12, 2020
e9ff779
ENH: Support MultiIndex columns in parquet #34777
hweecat Sep 13, 2020
1b9e3f0
ENH: Support MultiIndex columns in parquet #34777
hweecat Sep 13, 2020
9e8f4eb
ENH: Support MultiIndex columns in parquet #34777
hweecat Sep 14, 2020
cc0f504
ENH: Support MultiIndex columns in parquet #34777
hweecat Sep 14, 2020
3ba38fa
Merge remote-tracking branch 'upstream/master' into io-parquet-multii…
hweecat Sep 14, 2020
a4131d2
Merge remote-tracking branch 'upstream/master' into io-parquet-multii…
hweecat Sep 25, 2020
26966b7
Merge remote-tracking branch 'upstream/master' into io-parquet-multii…
hweecat Oct 1, 2020
cc8e85c
Merge remote-tracking branch 'upstream/master' into io-parquet-multii…
hweecat Oct 10, 2020
3b9b52a
Merge remote-tracking branch 'upstream/master' into io-parquet-multii…
hweecat Oct 11, 2020
ed5fe60
ENH: Support MultiIndex columns in parquet #34777
hweecat Oct 11, 2020
c859a4f
fix doc failure
hweecat Oct 11, 2020
039094c
Merge remote-tracking branch 'upstream/master' into io-parquet-multii…
hweecat Nov 11, 2020
167ae69
Merge remote-tracking branch 'upstream/master' into io-parquet-multii…
hweecat Nov 14, 2020
2e4fc58
Merge remote-tracking branch 'upstream/master' into io-parquet-multii…
hweecat Nov 15, 2020
180ddff
Update doc/source/whatsnew/v1.2.0.rst
charlesdong1991 Nov 18, 2020
234009b
Merge remote-tracking branch 'upstream/master' into io-parquet-multii…
hweecat Nov 18, 2020
ab24628
Merge remote-tracking branch 'upstream/master' into io-parquet-multii…
hweecat Nov 19, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 15 additions & 4 deletions pandas/io/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from pandas.compat._optional import import_optional_dependency
from pandas.errors import AbstractMethodError

from pandas import DataFrame, get_option
from pandas import DataFrame, MultiIndex, get_option

from pandas.io.common import get_filepath_or_buffer, is_fsspec_url, stringify_path

Expand Down Expand Up @@ -53,9 +53,20 @@ def validate_dataframe(df: DataFrame):
if not isinstance(df, DataFrame):
raise ValueError("to_parquet only supports IO with DataFrames")

# must have value column names (strings only)
if df.columns.inferred_type not in {"string", "empty"}:
raise ValueError("parquet must have string column names")
# must have value column names for all index levels (strings only)
if isinstance(df.columns, MultiIndex):
if not all(
x.inferred_type in {"string", "empty"} for x in df.columns.levels
):
raise ValueError(
"""
parquet must have string column names for all values in
each level of the MultiIndex
"""
)
else:
if df.columns.inferred_type not in {"string", "empty"}:
raise ValueError("parquet must have string column names")

# index level names must be strings
valid_names = all(
Expand Down
66 changes: 60 additions & 6 deletions pandas/tests/io/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -430,12 +430,6 @@ def test_write_multiindex(self, pa):
df.index = index
check_round_trip(df, engine)

def test_write_column_multiindex(self, engine):
    """Check that writing a frame with MultiIndex columns raises ValueError.

    The column MultiIndex is built from tuples whose second element is an
    integer; the expectation is delegated to ``check_error_on_write``.
    """
    # column multi-index
    column_tuples = [("a", 1), ("a", 2), ("b", 1)]
    frame = pd.DataFrame(
        np.random.randn(4, 3),
        columns=pd.MultiIndex.from_tuples(column_tuples),
    )
    self.check_error_on_write(frame, engine, ValueError)

def test_multiindex_with_columns(self, pa):
engine = pa
dates = pd.date_range("01-Jan-2018", "01-Dec-2018", freq="MS")
Expand Down Expand Up @@ -484,6 +478,66 @@ def test_write_ignoring_index(self, engine):
expected = df.reset_index(drop=True)
check_round_trip(df, engine, write_kwargs=write_kwargs, expected=expected)

def test_write_column_multiindex(self, engine):
    """Check that a column MultiIndex with non-string labels fails on write.

    The second level of the MultiIndex holds integers, so a ``ValueError``
    is expected; the check itself is delegated to ``check_error_on_write``.
    """
    # Not able to write column multi-indexes with non-string column names.
    column_tuples = [("a", 1), ("a", 2), ("b", 1)]
    frame = pd.DataFrame(
        np.random.randn(4, 3),
        columns=pd.MultiIndex.from_tuples(column_tuples),
    )
    self.check_error_on_write(frame, engine, ValueError)

def test_write_column_multiindex_nonstring(self, pa):
    """Check that a named column MultiIndex with an integer level fails on write.

    GH #34777. Only the ``pa`` engine fixture is exercised
    (not supported in fastparquet as of 0.1.3).
    """
    # Not able to write column multi-indexes with non-string column names
    outer_labels = ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"]
    inner_labels = [1, 2, 1, 2, 1, 2, 1, 2]  # integer labels in the second level

    frame = pd.DataFrame(
        np.random.randn(8, 8), columns=[outer_labels, inner_labels]
    )
    frame.columns.names = ["Level1", "Level2"]

    self.check_error_on_write(frame, pa, ValueError)

def test_write_column_multiindex_string(self, pa):
    """Round-trip a frame whose column MultiIndex has string labels on both levels.

    GH #34777. Only the ``pa`` engine fixture is exercised
    (not supported in fastparquet as of 0.1.3).
    """
    # Write column multi-indexes with string column names
    outer_labels = ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"]
    inner_labels = ["one", "two", "one", "two", "one", "two", "one", "two"]

    frame = pd.DataFrame(
        np.random.randn(8, 8), columns=[outer_labels, inner_labels]
    )
    frame.columns.names = ["ColLevel1", "ColLevel2"]

    check_round_trip(frame, pa)

def test_write_column_index_string(self, pa):
    """Round-trip a frame with a flat, named column index of string labels.

    GH #34777. Only the ``pa`` engine fixture is exercised
    (not supported in fastparquet as of 0.1.3).
    """
    # Write column indexes with string column names
    column_labels = ["bar", "baz", "foo", "qux"]
    frame = pd.DataFrame(np.random.randn(8, 4), columns=column_labels)
    frame.columns.name = "StringCol"

    check_round_trip(frame, pa)

def test_write_column_index_nonstring(self, pa):
    """Check that a flat, named column index of integer labels fails on write.

    GH #34777. Only the ``pa`` engine fixture is exercised
    (not supported in fastparquet as of 0.1.3).
    """
    engine = pa

    # Not able to write column indexes with non-string column names
    # (the labels below are integers, so a ValueError is expected).
    arrays = [1, 2, 3, 4]
    df = pd.DataFrame(np.random.randn(8, 4), columns=arrays)
    df.columns.name = "NonStringCol"

    self.check_error_on_write(df, engine, ValueError)


class TestParquetPyArrow(Base):
def test_basic(self, pa, df_full):
Expand Down