diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index 29060a93923eb..4f4ace502d443 100644
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -204,6 +204,7 @@ Other enhancements
 - Roundtripping DataFrames with nullable integer or string data types to parquet
   (:meth:`~DataFrame.to_parquet` / :func:`read_parquet`) using the `'pyarrow'` engine
   now preserve those data types with pyarrow >= 1.0.0 (:issue:`20612`).
+- The ``partition_cols`` argument in :meth:`DataFrame.to_parquet` now accepts a string (:issue:`27117`)
 
 Build Changes
 ^^^^^^^^^^^^^
diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index a044cfcdf6a01..51eac8d481231 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -234,7 +234,7 @@ def to_parquet(
 
         .. versionadded:: 0.24.0
 
-    partition_cols : list, optional, default None
+    partition_cols : str or list, optional, default None
         Column names by which to partition the dataset
         Columns are partitioned in the order they are given
 
@@ -243,6 +243,8 @@
     kwargs
         Additional keyword arguments passed to the engine
     """
+    if isinstance(partition_cols, str):
+        partition_cols = [partition_cols]
     impl = get_engine(engine)
     return impl.write(
         df,
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index a98c93c250070..251548e7caaab 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -499,6 +499,19 @@ def test_partition_cols_supported(self, pa, df_full):
             assert len(dataset.partitions.partition_names) == 2
             assert dataset.partitions.partition_names == set(partition_cols)
 
+    def test_partition_cols_string(self, pa, df_full):
+        # GH #27117
+        partition_cols = "bool"
+        partition_cols_list = [partition_cols]
+        df = df_full
+        with tm.ensure_clean_dir() as path:
+            df.to_parquet(path, partition_cols=partition_cols, compression=None)
+            import pyarrow.parquet as pq
+
+            dataset = pq.ParquetDataset(path, validate_schema=False)
+            assert len(dataset.partitions.partition_names) == 1
+            assert dataset.partitions.partition_names == set(partition_cols_list)
+
     def test_empty_dataframe(self, pa):
         # GH #27339
         df = pd.DataFrame()
@@ -595,6 +608,23 @@ def test_partition_cols_supported(self, fp, df_full):
             actual_partition_cols = fastparquet.ParquetFile(path, False).cats
             assert len(actual_partition_cols) == 2
 
+    def test_partition_cols_string(self, fp, df_full):
+        # GH #27117
+        partition_cols = "bool"
+        df = df_full
+        with tm.ensure_clean_dir() as path:
+            df.to_parquet(
+                path,
+                engine="fastparquet",
+                partition_cols=partition_cols,
+                compression=None,
+            )
+            assert os.path.exists(path)
+            import fastparquet  # noqa: F811
+
+            actual_partition_cols = fastparquet.ParquetFile(path, False).cats
+            assert len(actual_partition_cols) == 1
+
     def test_partition_on_supported(self, fp, df_full):
         # GH #23283
         partition_cols = ["bool", "int"]
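
For reviewers, a minimal usage sketch of the behavior this patch enables, assuming a parquet engine (pyarrow or fastparquet) is installed; the frame contents and output directory names are illustrative, not part of the patch:

```python
import pandas as pd

df = pd.DataFrame({"bool": [True, False, True], "int": [1, 2, 3]})

# Previously, partition_cols had to be list-like, even for one column:
df.to_parquet("partitioned_list", partition_cols=["bool"], compression=None)

# With this change, a bare string is normalized to a one-element list,
# so this call produces the same bool=True/bool=False directory layout:
df.to_parquet("partitioned_str", partition_cols="bool", compression=None)
```

The normalization happens once in `pandas.io.parquet.to_parquet`, before dispatching to the engine, so both the pyarrow and fastparquet backends get the list form they already expect.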