
Commit fea45ba

phofl and mroeschke authored
Backport PR #52036 on branch 2.0.x (BUG: Remove unnecessary validation to non-string columns/index in df.to_parquet) (#52044)
BUG: Remove unnecessary validation to non-string columns/index in df.to_parquet (#52036) Co-authored-by: Matthew Roeschke <[email protected]>
1 parent 1cde5ec commit fea45ba
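
In practice, the change means pandas no longer rejects non-string column or index names itself and instead lets the engine decide. A minimal sketch of the new pyarrow behavior, mirroring the test_columns_dtypes_not_invalid test added below (assumes pyarrow is installed; the in-memory buffer is only for illustration):

    import io
    import pandas as pd

    df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))})
    df.columns = [0, 1]  # integer column names

    buf = io.BytesIO()
    # Before this commit, pandas raised
    # ValueError("parquet must have string column names") for any engine.
    df.to_parquet(buf, engine="pyarrow")
    buf.seek(0)
    result = pd.read_parquet(buf, engine="pyarrow")
    print(result.columns)  # integer labels round-trip via pyarrow's pandas metadata

fastparquet still validates column names on its own, as the updated tests further down show.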

3 files changed (+79, -60 lines changed)


doc/source/whatsnew/v2.0.0.rst (+1)

@@ -1293,6 +1293,7 @@ I/O
 - Bug in :func:`read_csv` when ``engine="pyarrow"`` where ``encoding`` parameter was not handled correctly (:issue:`51302`)
 - Bug in :func:`read_xml` ignored repeated elements when iterparse is used (:issue:`51183`)
 - Bug in :class:`ExcelWriter` leaving file handles open if an exception occurred during instantiation (:issue:`51443`)
+- Bug in :meth:`DataFrame.to_parquet` where non-string index or columns were raising a ``ValueError`` when ``engine="pyarrow"`` (:issue:`52036`)

 Period
 ^^^^^^

pandas/io/parquet.py (-23)

@@ -27,7 +27,6 @@
 import pandas as pd
 from pandas import (
     DataFrame,
-    MultiIndex,
     get_option,
 )
 from pandas.core.shared_docs import _shared_docs

@@ -122,28 +121,6 @@ def validate_dataframe(df: DataFrame) -> None:
         if not isinstance(df, DataFrame):
             raise ValueError("to_parquet only supports IO with DataFrames")

-        # must have value column names for all index levels (strings only)
-        if isinstance(df.columns, MultiIndex):
-            if not all(
-                x.inferred_type in {"string", "empty"} for x in df.columns.levels
-            ):
-                raise ValueError(
-                    """
-                    parquet must have string column names for all values in
-                    each level of the MultiIndex
-                    """
-                )
-        else:
-            if df.columns.inferred_type not in {"string", "empty"}:
-                raise ValueError("parquet must have string column names")
-
-        # index level names must be strings
-        valid_names = all(
-            isinstance(name, str) for name in df.index.names if name is not None
-        )
-        if not valid_names:
-            raise ValueError("Index level names must be strings")
-
     def write(self, df: DataFrame, path, compression, **kwargs):
         raise AbstractMethodError(self)
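With the block above removed, the shared BaseImpl.validate_dataframe hook only keeps the type check; a sketch of what remains (class and decorator context reconstructed from the surrounding source, shown for orientation only):

    from pandas import DataFrame
    from pandas.errors import AbstractMethodError


    class BaseImpl:
        @staticmethod
        def validate_dataframe(df: DataFrame) -> None:
            # The only remaining guard: to_parquet operates on DataFrames, not Series.
            if not isinstance(df, DataFrame):
                raise ValueError("to_parquet only supports IO with DataFrames")

        def write(self, df: DataFrame, path, compression, **kwargs):
            raise AbstractMethodError(self)

Column-name and index-name validation now happens, if at all, inside the individual engines.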

pandas/tests/io/test_parquet.py (+78, -37)

@@ -404,25 +404,6 @@ def test_columns_dtypes(self, engine):
         df.columns = ["foo", "bar"]
         check_round_trip(df, engine)

-    def test_columns_dtypes_invalid(self, engine):
-        df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))})
-
-        msg = "parquet must have string column names"
-        # numeric
-        df.columns = [0, 1]
-        self.check_error_on_write(df, engine, ValueError, msg)
-
-        # bytes
-        df.columns = [b"foo", b"bar"]
-        self.check_error_on_write(df, engine, ValueError, msg)
-
-        # python object
-        df.columns = [
-            datetime.datetime(2011, 1, 1, 0, 0),
-            datetime.datetime(2011, 1, 1, 1, 1),
-        ]
-        self.check_error_on_write(df, engine, ValueError, msg)
-
     @pytest.mark.parametrize("compression", [None, "gzip", "snappy", "brotli"])
     def test_compression(self, engine, compression):
         if compression == "snappy":

@@ -528,16 +509,16 @@ def test_write_column_multiindex(self, engine):
         # Not able to write column multi-indexes with non-string column names.
         mi_columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)])
         df = pd.DataFrame(np.random.randn(4, 3), columns=mi_columns)
-        msg = (
-            r"\s*parquet must have string column names for all values in\s*"
-            "each level of the MultiIndex"
-        )
-        self.check_error_on_write(df, engine, ValueError, msg)

-    def test_write_column_multiindex_nonstring(self, pa):
+        if engine == "fastparquet":
+            self.check_error_on_write(
+                df, engine, TypeError, "Column name must be a string"
+            )
+        elif engine == "pyarrow":
+            check_round_trip(df, engine)
+
+    def test_write_column_multiindex_nonstring(self, engine):
         # GH #34777
-        # Not supported in fastparquet as of 0.1.3
-        engine = pa

         # Not able to write column multi-indexes with non-string column names
         arrays = [

@@ -546,11 +527,14 @@ def test_write_column_multiindex_nonstring(self, pa):
         ]
         df = pd.DataFrame(np.random.randn(8, 8), columns=arrays)
         df.columns.names = ["Level1", "Level2"]
-        msg = (
-            r"\s*parquet must have string column names for all values in\s*"
-            "each level of the MultiIndex"
-        )
-        self.check_error_on_write(df, engine, ValueError, msg)
+        if engine == "fastparquet":
+            if Version(fastparquet.__version__) < Version("0.7.0"):
+                err = TypeError
+            else:
+                err = ValueError
+            self.check_error_on_write(df, engine, err, "Column name")
+        elif engine == "pyarrow":
+            check_round_trip(df, engine)

     def test_write_column_multiindex_string(self, pa):
         # GH #34777

@@ -579,17 +563,19 @@ def test_write_column_index_string(self, pa):

         check_round_trip(df, engine)

-    def test_write_column_index_nonstring(self, pa):
+    def test_write_column_index_nonstring(self, engine):
         # GH #34777
-        # Not supported in fastparquet as of 0.1.3
-        engine = pa

         # Write column indexes with string column names
         arrays = [1, 2, 3, 4]
         df = pd.DataFrame(np.random.randn(8, 4), columns=arrays)
         df.columns.name = "NonStringCol"
-        msg = r"parquet must have string column names"
-        self.check_error_on_write(df, engine, ValueError, msg)
+        if engine == "fastparquet":
+            self.check_error_on_write(
+                df, engine, TypeError, "Column name must be a string"
+            )
+        else:
+            check_round_trip(df, engine)

     @pytest.mark.skipif(pa_version_under7p0, reason="minimum pyarrow not installed")
     def test_dtype_backend(self, engine, request):

@@ -1041,6 +1027,31 @@ def test_read_dtype_backend_pyarrow_config_index(self, pa):
             expected=expected,
         )

+    def test_columns_dtypes_not_invalid(self, pa):
+        df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))})
+
+        # numeric
+        df.columns = [0, 1]
+        check_round_trip(df, pa)
+
+        # bytes
+        df.columns = [b"foo", b"bar"]
+        with pytest.raises(NotImplementedError, match="|S3"):
+            # Bytes fails on read_parquet
+            check_round_trip(df, pa)
+
+        # python object
+        df.columns = [
+            datetime.datetime(2011, 1, 1, 0, 0),
+            datetime.datetime(2011, 1, 1, 1, 1),
+        ]
+        check_round_trip(df, pa)
+
+    def test_empty_columns(self, pa):
+        # GH 52034
+        df = pd.DataFrame(index=pd.Index(["a", "b", "c"], name="custom name"))
+        check_round_trip(df, pa)
+

 class TestParquetFastParquet(Base):
     def test_basic(self, fp, df_full):

@@ -1052,6 +1063,27 @@ def test_basic(self, fp, df_full):
         df["timedelta"] = pd.timedelta_range("1 day", periods=3)
         check_round_trip(df, fp)

+    def test_columns_dtypes_invalid(self, fp):
+        df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))})
+
+        err = TypeError
+        msg = "Column name must be a string"
+
+        # numeric
+        df.columns = [0, 1]
+        self.check_error_on_write(df, fp, err, msg)
+
+        # bytes
+        df.columns = [b"foo", b"bar"]
+        self.check_error_on_write(df, fp, err, msg)
+
+        # python object
+        df.columns = [
+            datetime.datetime(2011, 1, 1, 0, 0),
+            datetime.datetime(2011, 1, 1, 1, 1),
+        ]
+        self.check_error_on_write(df, fp, err, msg)
+
     def test_duplicate_columns(self, fp):
         # not currently able to handle duplicate columns
         df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=list("aaa")).copy()

@@ -1221,3 +1253,12 @@ def test_invalid_dtype_backend(self, engine):
             df.to_parquet(path)
         with pytest.raises(ValueError, match=msg):
             read_parquet(path, dtype_backend="numpy")
+
+    def test_empty_columns(self, fp):
+        # GH 52034
+        df = pd.DataFrame(index=pd.Index(["a", "b", "c"], name="custom name"))
+        expected = pd.DataFrame(
+            columns=pd.Index([], dtype=object),
+            index=pd.Index(["a", "b", "c"], name="custom name"),
+        )
+        check_round_trip(df, fp, expected=expected)
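
The new tests split behavior by engine: pyarrow now round-trips non-string and empty column sets, while fastparquet keeps raising at write time. A hedged sketch of the fastparquet case asserted by test_columns_dtypes_invalid above (assumes fastparquet is installed; the exact message can vary across fastparquet versions, as the MultiIndex test acknowledges):

    import datetime
    import os
    import tempfile

    import pandas as pd
    import pytest

    df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))})
    df.columns = [
        datetime.datetime(2011, 1, 1, 0, 0),
        datetime.datetime(2011, 1, 1, 1, 1),
    ]

    with tempfile.TemporaryDirectory() as tmp:
        # The error now originates in fastparquet itself rather than in the
        # pre-write validation that this commit removes from pandas.
        with pytest.raises(TypeError, match="Column name must be a string"):
            df.to_parquet(os.path.join(tmp, "cols.parquet"), engine="fastparquet")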
