
Commit 2cae2fe

closes #23283
1 parent caea25a commit 2cae2fe

File tree

3 files changed: +36 −8 lines changed


doc/source/whatsnew/v0.24.0.txt

+1
@@ -213,6 +213,7 @@ Other Enhancements
 - New attribute :attr:`__git_version__` will return git commit sha of current build (:issue:`21295`).
 - Compatibility with Matplotlib 3.0 (:issue:`22790`).
 - Added :meth:`Interval.overlaps`, :meth:`IntervalArray.overlaps`, and :meth:`IntervalIndex.overlaps` for determining overlaps between interval-like objects (:issue:`21998`)
+- :func:`~DataFrame.to_parquet` now supports writing a DataFrame as a directory of parquet files partitioned by a subset of the columns. (:issue:`23283`).
 - :meth:`Timestamp.tz_localize`, :meth:`DatetimeIndex.tz_localize`, and :meth:`Series.tz_localize` have gained the ``nonexistent`` argument for alternative handling of nonexistent times. See :ref:`timeseries.timezone_nonexsistent` (:issue:`8917`)
 
 .. _whatsnew_0240.api_breaking:
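
A minimal usage sketch of the enhancement above (the frame, column name, and output directory are illustrative; pyarrow >= 0.7.0 is assumed, matching the version gate in the tests further down):

import pandas as pd

df = pd.DataFrame({'year': [2018, 2018, 2019],
                   'value': [1.0, 2.0, 3.0]})

# Writes a directory tree of parquet files partitioned by 'year', e.g.
#   out_dir/year=2018/<uuid>.parquet
#   out_dir/year=2019/<uuid>.parquet
df.to_parquet('out_dir', engine='pyarrow', partition_cols=['year'])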

pandas/io/parquet.py

+12 −7
@@ -42,7 +42,6 @@ def get_engine(engine):
 
 
 class BaseImpl(object):
-
     api = None  # module
 
     @staticmethod
@@ -97,9 +96,9 @@ def __init__(self):
             )
 
         self._pyarrow_lt_060 = (
-            LooseVersion(pyarrow.__version__) < LooseVersion('0.6.0'))
+            LooseVersion(pyarrow.__version__) < LooseVersion('0.6.0'))
         self._pyarrow_lt_070 = (
-            LooseVersion(pyarrow.__version__) < LooseVersion('0.7.0'))
+            LooseVersion(pyarrow.__version__) < LooseVersion('0.7.0'))
 
         self.api = pyarrow
 
@@ -125,9 +124,14 @@ def write(self, df, path, compression='snappy',
 
         else:
             table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
-        self.api.parquet.write_table(
-            table, path, compression=compression,
-            coerce_timestamps=coerce_timestamps, **kwargs)
+        if 'partition_cols' in kwargs:
+            self.api.parquet.write_to_dataset(
+                table, path, compression=compression,
+                coerce_timestamps=coerce_timestamps, **kwargs)
+        else:
+            self.api.parquet.write_table(
+                table, path, compression=compression,
+                coerce_timestamps=coerce_timestamps, **kwargs)
 
     def read(self, path, columns=None, **kwargs):
         path, _, _, should_close = get_filepath_or_buffer(path)
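
In this commit ``partition_cols`` reaches the engine through ``**kwargs``, so the new branch above only checks for the key and forwards everything to pyarrow. A rough sketch of the two underlying pyarrow calls being dispatched to (directory, file, and column names are illustrative; ``write_to_dataset`` is assumed available, i.e. pyarrow >= 0.7.0):

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

table = pa.Table.from_pandas(
    pd.DataFrame({'year': [2018, 2019], 'value': [1.0, 2.0]}))

# partition_cols given: one Hive-style subdirectory per unique key
# combination, written under the supplied root path.
pq.write_to_dataset(table, root_path='out_dir', partition_cols=['year'])

# no partition_cols: a single parquet file at the supplied path.
pq.write_table(table, 'out.parquet')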
@@ -252,7 +256,8 @@ def to_parquet(df, path, engine='auto', compression='snappy', index=None,
     ----------
     df : DataFrame
     path : string
-        File path
+        File path (will be used as ``root_path`` if
+        ``partition_cols`` is provided for the 'pyarrow' engine).
     engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
         Parquet library to use. If 'auto', then the option
         ``io.parquet.engine`` is used. The default ``io.parquet.engine``

pandas/tests/io/test_parquet.py

+23 −1
@@ -1,6 +1,8 @@
 """ test parquet compat """
 
 import pytest
+import tempfile
+import shutil
 import datetime
 from distutils.version import LooseVersion
 from warnings import catch_warnings
@@ -14,12 +16,14 @@
 
 try:
     import pyarrow  # noqa
+
     _HAVE_PYARROW = True
 except ImportError:
     _HAVE_PYARROW = False
 
 try:
     import fastparquet  # noqa
+
     _HAVE_FASTPARQUET = True
 except ImportError:
     _HAVE_FASTPARQUET = False
@@ -406,7 +410,6 @@ def test_write_ignoring_index(self, engine):
 class TestParquetPyArrow(Base):
 
     def test_basic(self, pa, df_full):
-
         df = df_full
 
         # additional supported types for pyarrow
@@ -478,6 +481,25 @@ def test_s3_roundtrip(self, df_compat, s3_resource, pa):
         check_round_trip(df_compat, pa,
                          path='s3://pandas-test/pyarrow.parquet')
 
+    def test_partition_cols_supported(self, pa_ge_070, df_full):
+        partition_cols = ['bool', 'int']
+        df = df_full
+        path = tempfile.mkdtemp()
+        df.to_parquet(path, partition_cols=partition_cols,
+                      compression=None)
+        import pyarrow.parquet as pq
+        dataset = pq.ParquetDataset(path, validate_schema=False)
+        assert len(dataset.pieces) == 2
+        assert len(dataset.partitions.partition_names) == 2
+        assert dataset.partitions.partition_names == set(partition_cols)
+        shutil.rmtree(path)
+
+    def test_ignore_partition_cols_lt_070(self, pa_lt_070, df_full):
+        partition_cols = ['bool', 'int']
+        pa = pa_lt_070
+        df = df_full
+        check_round_trip(df, pa, write_kwargs={'partition_cols': partition_cols})
+
 
 class TestParquetFastParquet(Base):
 
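test_partition_cols_supported inspects the written dataset directly with ``pyarrow.parquet``. For completeness, a hedged sketch of reading such a partitioned directory back at the pandas level ('out_dir' refers to a directory written with ``partition_cols`` as in the earlier sketch; the categorical behavior is pyarrow's usual partition handling, not something this commit asserts):

import pandas as pd

# pyarrow discovers the Hive-style partition directories and
# reattaches the partition keys as columns (typically categorical,
# reconstructed from the directory names rather than file contents).
result = pd.read_parquet('out_dir', engine='pyarrow')
print(result.dtypes)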