
ENH: Support string arguments for partition_cols in pandas.to_parquet #30213

Merged
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.0.0.rst
@@ -204,6 +204,7 @@ Other enhancements
- Roundtripping DataFrames with nullable integer or string data types to parquet
(:meth:`~DataFrame.to_parquet` / :func:`read_parquet`) using the `'pyarrow'` engine
now preserves those data types with pyarrow >= 1.0.0 (:issue:`20612`).
- The ``partition_cols`` argument in :meth:`DataFrame.to_parquet` now accepts a string (:issue:`27117`)
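
A minimal sketch of what this entry enables (the frame and output paths are illustrative; the list form keeps working unchanged):

import pandas as pd

df = pd.DataFrame({"year": [2019, 2019, 2020], "value": [1.0, 2.0, 3.0]})

# Before this change, a single partition column had to be wrapped in a list:
df.to_parquet("out_list", partition_cols=["year"])

# Now a bare column name is accepted and treated as a one-element list:
df.to_parquet("out_str", partition_cols="year")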

Build Changes
^^^^^^^^^^^^^
4 changes: 3 additions & 1 deletion pandas/io/parquet.py
@@ -234,7 +234,7 @@ def to_parquet(

.. versionadded:: 0.24.0

partition_cols : list, optional, default None
partition_cols : str or list, optional, default None
Column names by which to partition the dataset.
Columns are partitioned in the order they are given.

@@ -243,6 +243,8 @@
kwargs
Additional keyword arguments passed to the engine.
"""
if isinstance(partition_cols, str):
partition_cols = [partition_cols]
impl = get_engine(engine)
return impl.write(
df,
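The change itself is a two-line normalization at the top of to_parquet: a bare string is wrapped in a one-element list before being passed to the engine, so pyarrow and fastparquet always receive a list of column names. A standalone sketch of the same pattern (the helper name is hypothetical; the PR inlines the check):

def normalize_partition_cols(partition_cols):
    # Wrap a bare column name in a one-element list; lists and None
    # pass through untouched.
    if isinstance(partition_cols, str):
        partition_cols = [partition_cols]
    return partition_cols

assert normalize_partition_cols("bool") == ["bool"]
assert normalize_partition_cols(["bool", "int"]) == ["bool", "int"]
assert normalize_partition_cols(None) is None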
30 changes: 30 additions & 0 deletions pandas/tests/io/test_parquet.py
@@ -499,6 +499,19 @@ def test_partition_cols_supported(self, pa, df_full):
assert len(dataset.partitions.partition_names) == 2
assert dataset.partitions.partition_names == set(partition_cols)

def test_partition_cols_string(self, pa, df_full):
# GH #27117
partition_cols = "bool"
partition_cols_list = [partition_cols]
df = df_full
with tm.ensure_clean_dir() as path:
df.to_parquet(path, partition_cols=partition_cols, compression=None)
import pyarrow.parquet as pq

dataset = pq.ParquetDataset(path, validate_schema=False)
assert len(dataset.partitions.partition_names) == 1
assert dataset.partitions.partition_names == set(partition_cols_list)

def test_empty_dataframe(self, pa):
# GH #27339
df = pd.DataFrame()
@@ -595,6 +608,23 @@ def test_partition_cols_supported(self, fp, df_full):
actual_partition_cols = fastparquet.ParquetFile(path, False).cats
assert len(actual_partition_cols) == 2

def test_partition_cols_string(self, fp, df_full):
# GH #27117
partition_cols = "bool"
df = df_full
with tm.ensure_clean_dir() as path:
df.to_parquet(
path,
engine="fastparquet",
partition_cols=partition_cols,
compression=None,
)
assert os.path.exists(path)
import fastparquet # noqa: F811

actual_partition_cols = fastparquet.ParquetFile(path, False).cats
assert len(actual_partition_cols) == 1

def test_partition_on_supported(self, fp, df_full):
# GH #23283
partition_cols = ["bool", "int"]
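The behavior both tests assert can also be checked by hand; a sketch assuming pyarrow is installed, with an illustrative output directory:

import pandas as pd
import pyarrow.parquet as pq

df = pd.DataFrame({"bool": [True, False, True], "x": [1, 2, 3]})
df.to_parquet("dataset_dir", partition_cols="bool", compression=None)

# The string form should yield the same single-level partitioning as
# partition_cols=["bool"]: one directory per distinct value of "bool".
dataset = pq.ParquetDataset("dataset_dir", validate_schema=False)
assert dataset.partitions.partition_names == {"bool"}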