Skip to content

Commit 3904942

Browse files
Blake Hawkins authored and jorisvandenbossche committed
ENH: Support string arguments for partition_cols in pandas.to_parquet (#30213)
1 parent 9294f91 commit 3904942

File tree

3 files changed

+34
-1
lines changed

3 files changed

+34
-1
lines changed

doc/source/whatsnew/v1.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,7 @@ Other enhancements
204204
- Roundtripping DataFrames with nullable integer or string data types to parquet
205205
(:meth:`~DataFrame.to_parquet` / :func:`read_parquet`) using the `'pyarrow'` engine
206206
now preserve those data types with pyarrow >= 1.0.0 (:issue:`20612`).
207+
- The ``partition_cols`` argument in :meth:`DataFrame.to_parquet` now accepts a string (:issue:`27117`)
207208

208209
Build Changes
209210
^^^^^^^^^^^^^

pandas/io/parquet.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -234,7 +234,7 @@ def to_parquet(
234234
235235
.. versionadded:: 0.24.0
236236
237-
partition_cols : list, optional, default None
237+
partition_cols : str or list, optional, default None
238238
Column names by which to partition the dataset
239239
Columns are partitioned in the order they are given
240240
@@ -243,6 +243,8 @@ def to_parquet(
243243
kwargs
244244
Additional keyword arguments passed to the engine
245245
"""
246+
if isinstance(partition_cols, str):
247+
partition_cols = [partition_cols]
246248
impl = get_engine(engine)
247249
return impl.write(
248250
df,

pandas/tests/io/test_parquet.py

+30
Original file line numberDiff line numberDiff line change
@@ -499,6 +499,19 @@ def test_partition_cols_supported(self, pa, df_full):
499499
assert len(dataset.partitions.partition_names) == 2
500500
assert dataset.partitions.partition_names == set(partition_cols)
501501

502+
def test_partition_cols_string(self, pa, df_full):
503+
# GH #27117
504+
partition_cols = "bool"
505+
partition_cols_list = [partition_cols]
506+
df = df_full
507+
with tm.ensure_clean_dir() as path:
508+
df.to_parquet(path, partition_cols=partition_cols, compression=None)
509+
import pyarrow.parquet as pq
510+
511+
dataset = pq.ParquetDataset(path, validate_schema=False)
512+
assert len(dataset.partitions.partition_names) == 1
513+
assert dataset.partitions.partition_names == set(partition_cols_list)
514+
502515
def test_empty_dataframe(self, pa):
503516
# GH #27339
504517
df = pd.DataFrame()
@@ -595,6 +608,23 @@ def test_partition_cols_supported(self, fp, df_full):
595608
actual_partition_cols = fastparquet.ParquetFile(path, False).cats
596609
assert len(actual_partition_cols) == 2
597610

611+
def test_partition_cols_string(self, fp, df_full):
612+
# GH #27117
613+
partition_cols = "bool"
614+
df = df_full
615+
with tm.ensure_clean_dir() as path:
616+
df.to_parquet(
617+
path,
618+
engine="fastparquet",
619+
partition_cols=partition_cols,
620+
compression=None,
621+
)
622+
assert os.path.exists(path)
623+
import fastparquet # noqa: F811
624+
625+
actual_partition_cols = fastparquet.ParquetFile(path, False).cats
626+
assert len(actual_partition_cols) == 1
627+
598628
def test_partition_on_supported(self, fp, df_full):
599629
# GH #23283
600630
partition_cols = ["bool", "int"]

0 commit comments

Comments (0)