diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 503f6a50554b8..3afa7ee7c3f03 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -1293,6 +1293,7 @@ I/O - Bug in :func:`read_csv` when ``engine="pyarrow"`` where ``encoding`` parameter was not handled correctly (:issue:`51302`) - Bug in :func:`read_xml` ignored repeated elements when iterparse is used (:issue:`51183`) - Bug in :class:`ExcelWriter` leaving file handles open if an exception occurred during instantiation (:issue:`51443`) +- Bug in :meth:`DataFrame.to_parquet` where non-string index or columns were raising a ``ValueError`` when ``engine="pyarrow"`` (:issue:`52036`) Period ^^^^^^ diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 7857464fc880e..18d9649a3102c 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -27,7 +27,6 @@ import pandas as pd from pandas import ( DataFrame, - MultiIndex, get_option, ) from pandas.core.shared_docs import _shared_docs @@ -122,28 +121,6 @@ def validate_dataframe(df: DataFrame) -> None: if not isinstance(df, DataFrame): raise ValueError("to_parquet only supports IO with DataFrames") - # must have value column names for all index levels (strings only) - if isinstance(df.columns, MultiIndex): - if not all( - x.inferred_type in {"string", "empty"} for x in df.columns.levels - ): - raise ValueError( - """ - parquet must have string column names for all values in - each level of the MultiIndex - """ - ) - else: - if df.columns.inferred_type not in {"string", "empty"}: - raise ValueError("parquet must have string column names") - - # index level names must be strings - valid_names = all( - isinstance(name, str) for name in df.index.names if name is not None - ) - if not valid_names: - raise ValueError("Index level names must be strings") - def write(self, df: DataFrame, path, compression, **kwargs): raise AbstractMethodError(self) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 1548208c7eeaa..b9efcecb0c3eb 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -404,25 +404,6 @@ def test_columns_dtypes(self, engine): df.columns = ["foo", "bar"] check_round_trip(df, engine) - def test_columns_dtypes_invalid(self, engine): - df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))}) - - msg = "parquet must have string column names" - # numeric - df.columns = [0, 1] - self.check_error_on_write(df, engine, ValueError, msg) - - # bytes - df.columns = [b"foo", b"bar"] - self.check_error_on_write(df, engine, ValueError, msg) - - # python object - df.columns = [ - datetime.datetime(2011, 1, 1, 0, 0), - datetime.datetime(2011, 1, 1, 1, 1), - ] - self.check_error_on_write(df, engine, ValueError, msg) - @pytest.mark.parametrize("compression", [None, "gzip", "snappy", "brotli"]) def test_compression(self, engine, compression): if compression == "snappy": @@ -528,16 +509,16 @@ def test_write_column_multiindex(self, engine): # Not able to write column multi-indexes with non-string column names. mi_columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)]) df = pd.DataFrame(np.random.randn(4, 3), columns=mi_columns) - msg = ( - r"\s*parquet must have string column names for all values in\s*" - "each level of the MultiIndex" - ) - self.check_error_on_write(df, engine, ValueError, msg) - def test_write_column_multiindex_nonstring(self, pa): + if engine == "fastparquet": + self.check_error_on_write( + df, engine, TypeError, "Column name must be a string" + ) + elif engine == "pyarrow": + check_round_trip(df, engine) + + def test_write_column_multiindex_nonstring(self, engine): # GH #34777 - # Not supported in fastparquet as of 0.1.3 - engine = pa # Not able to write column multi-indexes with non-string column names arrays = [ @@ -546,11 +527,14 @@ def test_write_column_multiindex_nonstring(self, pa): ] df = pd.DataFrame(np.random.randn(8, 8), columns=arrays) df.columns.names = ["Level1", "Level2"] - msg = ( - r"\s*parquet must have string column names for all values in\s*" - "each level of the MultiIndex" - ) - self.check_error_on_write(df, engine, ValueError, msg) + if engine == "fastparquet": + if Version(fastparquet.__version__) < Version("0.7.0"): + err = TypeError + else: + err = ValueError + self.check_error_on_write(df, engine, err, "Column name") + elif engine == "pyarrow": + check_round_trip(df, engine) def test_write_column_multiindex_string(self, pa): # GH #34777 @@ -579,17 +563,19 @@ def test_write_column_index_string(self, pa): check_round_trip(df, engine) - def test_write_column_index_nonstring(self, pa): + def test_write_column_index_nonstring(self, engine): # GH #34777 - # Not supported in fastparquet as of 0.1.3 - engine = pa # Write column indexes with string column names arrays = [1, 2, 3, 4] df = pd.DataFrame(np.random.randn(8, 4), columns=arrays) df.columns.name = "NonStringCol" - msg = r"parquet must have string column names" - self.check_error_on_write(df, engine, ValueError, msg) + if engine == "fastparquet": + self.check_error_on_write( + df, engine, TypeError, "Column name must be a string" + ) + else: + check_round_trip(df, engine) @pytest.mark.skipif(pa_version_under7p0, reason="minimum pyarrow not installed") def test_dtype_backend(self, engine, request): @@ -1041,6 +1027,31 @@ def test_read_dtype_backend_pyarrow_config_index(self, pa): expected=expected, ) + def test_columns_dtypes_not_invalid(self, pa): + df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))}) + + # numeric + df.columns = [0, 1] + check_round_trip(df, pa) + + # bytes + df.columns = [b"foo", b"bar"] + with pytest.raises(NotImplementedError, match="|S3"): + # Bytes fails on read_parquet + check_round_trip(df, pa) + + # python object + df.columns = [ + datetime.datetime(2011, 1, 1, 0, 0), + datetime.datetime(2011, 1, 1, 1, 1), + ] + check_round_trip(df, pa) + + def test_empty_columns(self, pa): + # GH 52034 + df = pd.DataFrame(index=pd.Index(["a", "b", "c"], name="custom name")) + check_round_trip(df, pa) + class TestParquetFastParquet(Base): def test_basic(self, fp, df_full): @@ -1052,6 +1063,27 @@ def test_basic(self, fp, df_full): df["timedelta"] = pd.timedelta_range("1 day", periods=3) check_round_trip(df, fp) + def test_columns_dtypes_invalid(self, fp): + df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))}) + + err = TypeError + msg = "Column name must be a string" + + # numeric + df.columns = [0, 1] + self.check_error_on_write(df, fp, err, msg) + + # bytes + df.columns = [b"foo", b"bar"] + self.check_error_on_write(df, fp, err, msg) + + # python object + df.columns = [ + datetime.datetime(2011, 1, 1, 0, 0), + datetime.datetime(2011, 1, 1, 1, 1), + ] + self.check_error_on_write(df, fp, err, msg) + def test_duplicate_columns(self, fp): # not currently able to handle duplicate columns df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=list("aaa")).copy() @@ -1221,3 +1253,12 @@ def test_invalid_dtype_backend(self, engine): df.to_parquet(path) with pytest.raises(ValueError, match=msg): read_parquet(path, dtype_backend="numpy") + + def test_empty_columns(self, fp): + # GH 52034 + df = pd.DataFrame(index=pd.Index(["a", "b", "c"], name="custom name")) + expected = pd.DataFrame( + columns=pd.Index([], dtype=object), + index=pd.Index(["a", "b", "c"], name="custom name"), + ) + check_round_trip(df, fp, expected=expected)