Commit 1636681

Update documentation; use `with` to clean up the temporary test directory; add partition_cols support for fastparquet
1 parent 0d9f878 commit 1636681

File tree

6 files changed: +85 −24 lines changed

doc/source/io.rst

+2

@@ -4574,6 +4574,8 @@ Several caveats.
 * Categorical dtypes can be serialized to parquet, but will de-serialize as ``object`` dtype.
 * Non supported types include ``Period`` and actual Python object types. These will raise a helpful error message
   on an attempt at serialization.
+* ``partition_cols`` will be used for partitioning the dataset, where the dataset will be written to multiple
+  files in the path specified. Therefore, the path specified must be a directory path.
 
 You can specify an ``engine`` to direct the serialization. This can be one of ``pyarrow``, or ``fastparquet``, or ``auto``.
 If the engine is NOT specified, then the ``pd.options.io.parquet.engine`` option is checked; if this is also ``auto``,
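
As a rough illustration of the behaviour described above (the frame, column name and output path below are hypothetical, and a pyarrow >= 0.7.0 install is assumed), a partitioned write produces a directory tree rather than a single file:

import pandas as pd

df = pd.DataFrame({'year': [2017, 2018, 2018],
                   'value': [1.0, 2.0, 3.0]})

# The path is treated as a root directory; one sub-directory is created per
# distinct value of the partition column, e.g.
#   parquet_dataset/year=2017/<part>.parquet
#   parquet_dataset/year=2018/<part>.parquet
df.to_parquet('parquet_dataset', engine='pyarrow',
              partition_cols=['year'], compression=None)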

pandas/core/frame.py

+11 −3

@@ -1970,7 +1970,7 @@ def to_feather(self, fname):
         to_feather(self, fname)
 
     def to_parquet(self, fname, engine='auto', compression='snappy',
-                   index=None, **kwargs):
+                   index=None, partition_cols=None, **kwargs):
         """
         Write a DataFrame to the binary parquet format.
 
@@ -1984,7 +1984,8 @@ def to_parquet(self, fname, engine='auto', compression='snappy',
         Parameters
         ----------
         fname : str
-            String file path.
+            File path or root directory path. Will be used as the root
+            directory path while writing a partitioned dataset.
         engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
             Parquet library to use. If 'auto', then the option
             ``io.parquet.engine`` is used. The default ``io.parquet.engine``
@@ -1998,6 +1999,12 @@ def to_parquet(self, fname, engine='auto', compression='snappy',
             the behavior depends on the chosen engine.
 
             .. versionadded:: 0.24.0
+        partition_cols : list, optional, default None
+            Column names by which to partition the dataset.
+            Columns are partitioned in the order they are given.
+            The behaviour applies only to pyarrow >= 0.7.0 and fastparquet;
+            for other versions, this argument will be ignored.
+            .. versionadded:: 0.24.0
 
         **kwargs
             Additional arguments passed to the parquet library. See
@@ -2027,7 +2034,8 @@ def to_parquet(self, fname, engine='auto', compression='snappy',
         """
         from pandas.io.parquet import to_parquet
         to_parquet(self, fname, engine,
-                   compression=compression, index=index, **kwargs)
+                   compression=compression, index=index,
+                   partition_cols=partition_cols, **kwargs)
 
     @Substitution(header='Write out the column names. If a list of strings '
                          'is given, it is assumed to be aliases for the '
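
Because columns are partitioned in the order they are given, nesting follows the list order. A minimal sketch, assuming hypothetical column names and an illustrative output path:

import pandas as pd

df = pd.DataFrame({'country': ['US', 'US', 'DE'],
                   'year': [2017, 2018, 2018],
                   'value': [1.0, 2.0, 3.0]})

# partition_cols=['country', 'year'] nests the directories in that order:
#   dataset_root/country=US/year=2017/...
#   dataset_root/country=US/year=2018/...
#   dataset_root/country=DE/year=2018/...
df.to_parquet('dataset_root', engine='pyarrow',
              partition_cols=['country', 'year'], compression=None)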

pandas/io/parquet.py

+24 −11

@@ -104,7 +104,8 @@ def __init__(self):
         self.api = pyarrow
 
     def write(self, df, path, compression='snappy',
-              coerce_timestamps='ms', index=None, **kwargs):
+              coerce_timestamps='ms', index=None, partition_cols=None,
+              **kwargs):
         self.validate_dataframe(df)
 
         # Only validate the index if we're writing it.
@@ -125,10 +126,11 @@ def write(self, df, path, compression='snappy',
 
         else:
            table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
-        if 'partition_cols' in kwargs:
+        if partition_cols is not None:
            self.api.parquet.write_to_dataset(
                table, path, compression=compression,
-                coerce_timestamps=coerce_timestamps, **kwargs)
+                coerce_timestamps=coerce_timestamps,
+                partition_cols=partition_cols, **kwargs)
         else:
            self.api.parquet.write_table(
                table, path, compression=compression,
@@ -211,12 +213,16 @@ def __init__(self):
             )
         self.api = fastparquet
 
-    def write(self, df, path, compression='snappy', index=None, **kwargs):
+    def write(self, df, path, compression='snappy', index=None,
+              partition_cols=None, **kwargs):
         self.validate_dataframe(df)
         # thriftpy/protocol/compact.py:339:
         # DeprecationWarning: tostring() is deprecated.
         # Use tobytes() instead.
 
+        if partition_cols is not None:
+            kwargs['file_scheme'] = 'hive'
+
         if is_s3_url(path):
             # path is s3:// so we need to open the s3file in 'wb' mode.
             # TODO: Support 'ab'
@@ -229,7 +235,8 @@ def write(self, df, path, compression='snappy', index=None, **kwargs):
 
         with catch_warnings(record=True):
             self.api.write(path, df, compression=compression,
-                           write_index=index, **kwargs)
+                           write_index=index, partition_on=partition_cols,
+                           **kwargs)
 
     def read(self, path, columns=None, **kwargs):
         if is_s3_url(path):
@@ -249,16 +256,15 @@ def read(self, path, columns=None, **kwargs):
 
 
 def to_parquet(df, path, engine='auto', compression='snappy', index=None,
-               **kwargs):
+               partition_cols=None, **kwargs):
     """
     Write a DataFrame to the parquet format.
 
     Parameters
     ----------
-    df : DataFrame
-    path : string
-        File path ( Will be used as `root_path` if
-        `partition_cols` is provided as parameter for 'pyarrow' engine).
+    path : str
+        File path or root directory path. Will be used as the root directory
+        path while writing a partitioned dataset.
     engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
         Parquet library to use. If 'auto', then the option
         ``io.parquet.engine`` is used. The default ``io.parquet.engine``
@@ -272,11 +278,18 @@ def to_parquet(df, path, engine='auto', compression='snappy', index=None,
         engine's default behavior will be used.
 
         .. versionadded 0.24.0
+    partition_cols : list, optional
+        Column names by which to partition the dataset.
+        Columns are partitioned in the order they are given.
+        The behaviour applies only to pyarrow >= 0.7.0 and fastparquet;
+        for other versions, this argument will be ignored.
+        .. versionadded:: 0.24.0
     kwargs
         Additional keyword arguments passed to the engine
     """
     impl = get_engine(engine)
-    return impl.write(df, path, compression=compression, index=index, **kwargs)
+    return impl.write(df, path, compression=compression, index=index,
+                      partition_cols=partition_cols, **kwargs)
 
 
 def read_parquet(path, engine='auto', columns=None, **kwargs):
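
To make the two code paths above concrete, here is a minimal sketch of what each engine is ultimately asked to do when partition_cols=['year'] is passed (the frame, column name and output directories are hypothetical; pyarrow >= 0.7.0 and fastparquet are assumed to be installed):

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import fastparquet

df = pd.DataFrame({'year': [2017, 2018], 'value': [1.0, 2.0]})

# pyarrow branch: the converted table goes through write_to_dataset, which
# creates one sub-directory per partition value under the root path.
table = pa.Table.from_pandas(df)
pq.write_to_dataset(table, 'pyarrow_dataset', partition_cols=['year'])

# fastparquet branch: partitioning is requested via partition_on, and the
# wrapper switches the file scheme to 'hive' so the path is a directory.
fastparquet.write('fastparquet_dataset', df,
                  file_scheme='hive', partition_on=['year'])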

pandas/tests/io/test_parquet.py

+22 −10

@@ -1,8 +1,7 @@
 """ test parquet compat """
+import os
 
 import pytest
-import tempfile
-import shutil
 import datetime
 from distutils.version import LooseVersion
 from warnings import catch_warnings
@@ -481,18 +480,19 @@ def test_s3_roundtrip(self, df_compat, s3_resource, pa):
                          path='s3://pandas-test/pyarrow.parquet')
 
     def test_partition_cols_supported(self, pa_ge_070, df_full):
+        # GH #23283
         partition_cols = ['bool', 'int']
         df = df_full
-        path = tempfile.mkdtemp()
-        df.to_parquet(path, partition_cols=partition_cols,
-                      compression=None)
-        import pyarrow.parquet as pq
-        dataset = pq.ParquetDataset(path, validate_schema=False)
-        assert len(dataset.partitions.partition_names) == 2
-        assert dataset.partitions.partition_names == set(partition_cols)
-        shutil.rmtree(path)
+        with tm.ensure_clean_dir() as path:
+            df.to_parquet(path, partition_cols=partition_cols,
+                          compression=None)
+            import pyarrow.parquet as pq
+            dataset = pq.ParquetDataset(path, validate_schema=False)
+            assert len(dataset.partitions.partition_names) == 2
+            assert dataset.partitions.partition_names == set(partition_cols)
 
     def test_ignore_partition_cols_lt_070(self, pa_lt_070, df_full):
+        # GH #23283
         partition_cols = ['bool', 'int']
         pa = pa_lt_070
         df = df_full
@@ -564,3 +564,15 @@ def test_s3_roundtrip(self, df_compat, s3_resource, fp):
         # GH #19134
         check_round_trip(df_compat, fp,
                          path='s3://pandas-test/fastparquet.parquet')
+
+    def test_partition_cols_supported(self, fp, df_full):
+        # GH #23283
+        partition_cols = ['bool', 'int']
+        df = df_full
+        with tm.ensure_clean_dir() as path:
+            df.to_parquet(path, partition_cols=partition_cols,
+                          compression=None)
+            assert os.path.exists(path)
+            import fastparquet
+            actual_partition_cols = fastparquet.ParquetFile(path, False).cats
+            assert len(actual_partition_cols) == 2
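
The tests above inspect the partitions through each engine's own API; reading the data back through pandas itself also works by pointing read_parquet at the dataset directory. A short sketch, assuming a previously written, hypothetical 'parquet_dataset' directory and the pyarrow engine:

import pandas as pd

# The partition columns are reconstructed from the directory names
# (with pyarrow they typically come back as categorical-like values).
result = pd.read_parquet('parquet_dataset', engine='pyarrow')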

pandas/tests/util/test_testing.py

+9

@@ -875,3 +875,12 @@ def test_datapath_missing(datapath, request):
     )
 
     assert result == expected
+
+
+def test_create_temp_directory():
+    temppath = ''
+    with tm.ensure_clean_dir() as path:
+        assert os.path.exists(path)
+        assert os.path.isdir(path)
+        temppath = path
+    assert not os.path.exists(temppath)

pandas/util/testing.py

+17

@@ -772,6 +772,23 @@ def ensure_clean(filename=None, return_filelike=False):
             print("Exception on removing file: {error}".format(error=e))
 
 
+@contextmanager
+def ensure_clean_dir():
+    """
+    Get a temporary directory path and agree to remove it on close.
+
+    Yields
+    ------
+    Temporary directory path
+    """
+    directory_name = tempfile.mkdtemp(suffix='')
+    try:
+        yield directory_name
+    finally:
+        import shutil
+        shutil.rmtree(directory_name)
+
+
 # -----------------------------------------------------------------------------
 # Comparators
 