Commit d2ec124

closes #23283
1 parent caea25a

3 files changed: +29, -5 lines changed


doc/source/whatsnew/v0.24.0.txt (+1)
@@ -213,6 +213,7 @@ Other Enhancements
 - New attribute :attr:`__git_version__` will return git commit sha of current build (:issue:`21295`).
 - Compatibility with Matplotlib 3.0 (:issue:`22790`).
 - Added :meth:`Interval.overlaps`, :meth:`IntervalArray.overlaps`, and :meth:`IntervalIndex.overlaps` for determining overlaps between interval-like objects (:issue:`21998`)
+- :func:`~DataFrame.to_parquet` now supports ``partition_cols``, a list of columns to partition by when writing to parquet (:issue:`23283`).
 - :meth:`Timestamp.tz_localize`, :meth:`DatetimeIndex.tz_localize`, and :meth:`Series.tz_localize` have gained the ``nonexistent`` argument for alternative handling of nonexistent times. See :ref:`timeseries.timezone_nonexsistent` (:issue:`8917`)

 .. _whatsnew_0240.api_breaking:
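
The new keyword routes the write through pyarrow's partitioned-dataset writer. A minimal usage sketch, assuming the 'pyarrow' engine is installed; the frame, output directory name, and partition column below are illustrative:

    import pandas as pd

    df = pd.DataFrame({"year": [2018, 2018, 2019],
                       "value": [1.0, 2.0, 3.0]})

    # With partition_cols, the path is treated as a directory; one
    # subdirectory per distinct partition value (e.g. year=2018/) is written.
    df.to_parquet("sales_data", engine="pyarrow",
                  partition_cols=["year"], compression=None)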

pandas/io/parquet.py (+12, -5)
@@ -121,13 +121,19 @@ def write(self, df, path, compression='snappy',
             table = self.api.Table.from_pandas(df, timestamps_to_ms=True,
                                                **from_pandas_kwargs)
             self.api.parquet.write_table(
-                table, path, compression=compression, **kwargs)
+                table, path, compression=compression, **kwargs)

         else:
             table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
-            self.api.parquet.write_table(
-                table, path, compression=compression,
-                coerce_timestamps=coerce_timestamps, **kwargs)
+            if ('partition_cols' in kwargs and
+                    len(kwargs['partition_cols']) > 0):
+                self.api.parquet.write_to_dataset(
+                    table, path, compression=compression,
+                    coerce_timestamps=coerce_timestamps, **kwargs)
+            else:
+                self.api.parquet.write_table(
+                    table, path, compression=compression,
+                    coerce_timestamps=coerce_timestamps, **kwargs)

     def read(self, path, columns=None, **kwargs):
         path, _, _, should_close = get_filepath_or_buffer(path)
@@ -252,7 +258,8 @@ def to_parquet(df, path, engine='auto', compression='snappy', index=None,
     ----------
     df : DataFrame
     path : string
-        File path
+        File path (used as a directory path if ``partition_cols`` is
+        passed with the 'pyarrow' engine).
     engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
         Parquet library to use. If 'auto', then the option
         ``io.parquet.engine`` is used. The default ``io.parquet.engine``
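
The branch added above hands off to ``pyarrow.parquet.write_to_dataset`` when ``partition_cols`` is present, and keeps the existing ``write_table`` path otherwise. A rough sketch of the two underlying pyarrow calls outside of pandas (the table contents and paths are illustrative):

    import pandas as pd
    import pyarrow as pa
    import pyarrow.parquet as pq

    table = pa.Table.from_pandas(
        pd.DataFrame({"year": [2018, 2019], "value": [1.0, 2.0]}))

    # New branch: a directory of files laid out as <path>/year=<value>/...
    pq.write_to_dataset(table, "sales_data", partition_cols=["year"])

    # Existing branch: a single parquet file.
    pq.write_table(table, "single_file.parquet")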

pandas/tests/io/test_parquet.py (+16)
@@ -1,6 +1,8 @@
 """ test parquet compat """

 import pytest
+import tempfile
+import shutil
 import datetime
 from distutils.version import LooseVersion
 from warnings import catch_warnings
@@ -478,6 +480,20 @@ def test_s3_roundtrip(self, df_compat, s3_resource, pa):
         check_round_trip(df_compat, pa,
                          path='s3://pandas-test/pyarrow.parquet')

+    def test_partition_cols_supported(self, pa, df_full):
+        pa = pa_lt_070
+        partition_cols = ['bool', 'int']
+        df = df_full
+        path = tempfile.mkdtemp()
+        # supported in >= 0.7.0
+        df.to_parquet(path, partition_cols=partition_cols,
+                      compression=None)
+        import pyarrow.parquet as pq
+        dataset = pq.ParquetDataset(path, validate_schema=False)
+        assert len(dataset.pieces) > 0
+        assert len(dataset.partitions.partition_names) > 0
+        assert dataset.partitions.partition_names == set(partition_cols)
+        shutil.rmtree(path)

 class TestParquetFastParquet(Base):
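
The checks the test performs can be run against any directory written with ``partition_cols``. A hedged sketch of the same inspection outside the test suite, reusing the illustrative ``sales_data`` directory from the earlier sketch:

    import pyarrow.parquet as pq

    dataset = pq.ParquetDataset("sales_data", validate_schema=False)
    # One piece per written file, one partition level per partition column.
    print(len(dataset.pieces))
    print(dataset.partitions.partition_names)  # e.g. {'year'}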
