Skip to content

Commit bb87d0d

Browse files
committed
more flexible io.parquet.engine
1 parent d3ec8b5 commit bb87d0d

File tree

6 files changed

+73
-22
lines changed

6 files changed

+73
-22
lines changed

doc/source/io.rst

+4-1
Original file line numberDiff line numberDiff line change
@@ -4576,7 +4576,10 @@ Several caveats.
45764576
- Non supported types include ``Period`` and actual python object types. These will raise a helpful error message
45774577
on an attempt at serialization.
45784578

4579-
You can specifiy an ``engine`` to direct the serialization, defaulting to ``pyarrow`` and controlled by the ``pd.options.io.parquet``.
4579+
You can specify an ``engine`` to direct the serialization. This can be one of ``pyarrow``, ``fastparquet``, or ``auto``.
4580+
If the engine is NOT specified, then the ``pd.options.io.parquet.engine`` option is checked; if this is also ``auto``, then
4581+
``pyarrow`` is tried first, falling back to ``fastparquet``.
4582+
45804583
See the documentation for `pyarrow <http://arrow.apache.org/docs/python/>`__ and `fastparquet <https://fastparquet.readthedocs.io/en/latest/>`__
45814584

45824585
.. note::

doc/source/options.rst

+3-2
Original file line numberDiff line numberDiff line change
@@ -414,8 +414,9 @@ io.hdf.default_format None default format writing format,
414414
'table'
415415
io.hdf.dropna_table True drop ALL nan rows when appending
416416
to a table
417-
io.parquet.engine pyarrow The engine to use as a default for
418-
parquet reading and writing.
417+
io.parquet.engine auto The engine to use as a default for
418+
parquet reading and writing. If 'auto'
419+
then 'pyarrow' is tried, falling back to 'fastparquet'
419420
mode.chained_assignment warn Raise an exception, warn, or no
420421
action if trying to use chained
421422
assignment, The default is warn

pandas/core/config_init.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -467,9 +467,10 @@ def _register_xlsx(engine, other):
467467
parquet_engine_doc = """
468468
: string
469469
The default parquet reader/writer engine. Available options:
470-
'pyarrow', 'fastparquet', the default is 'pyarrow'
470+
'auto', 'pyarrow', 'fastparquet', the default is 'auto'
471471
"""
472472

473473
with cf.config_prefix('io.parquet'):
474-
cf.register_option('engine', 'pyarrow', parquet_engine_doc,
475-
validator=is_one_of_factory(['pyarrow', 'fastparquet']))
474+
cf.register_option(
475+
'engine', 'auto', parquet_engine_doc,
476+
validator=is_one_of_factory(['auto', 'pyarrow', 'fastparquet']))

pandas/core/frame.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -1598,7 +1598,7 @@ def to_feather(self, fname):
15981598
from pandas.io.feather_format import to_feather
15991599
to_feather(self, fname)
16001600

1601-
def to_parquet(self, fname, engine=None, compression='snappy',
1601+
def to_parquet(self, fname, engine='auto', compression='snappy',
16021602
**kwargs):
16031603
"""
16041604
Write a DataFrame to the binary parquet format.
@@ -1609,10 +1609,10 @@ def to_parquet(self, fname, engine=None, compression='snappy',
16091609
----------
16101610
fname : str
16111611
string file path
1612-
engine : str, optional
1613-
The parquet engine, one of {'pyarrow', 'fastparquet'}
1614-
If None, will use the option: `io.parquet.engine`, which
1615-
defaults to 'pyarrow'
1612+
engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
1613+
Parquet library to use. If 'auto', then the option
1614+
'io.parquet.engine' is used. If that option is also 'auto', then the first
1615+
library to be installed is used.
16161616
compression : str, optional, default 'snappy'
16171617
compression method, includes {'gzip', 'snappy', 'brotli'}
16181618
kwargs

pandas/io/parquet.py

+23-10
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,21 @@
1010
def get_engine(engine):
1111
""" return our implementation """
1212

13-
if engine is None:
13+
if engine == 'auto':
1414
engine = get_option('io.parquet.engine')
1515

16+
if engine == 'auto':
17+
# try engines in this order
18+
try:
19+
return PyArrowImpl()
20+
except ImportError:
21+
pass
22+
23+
try:
24+
return FastParquetImpl()
25+
except ImportError:
26+
pass
27+
1628
if engine not in ['pyarrow', 'fastparquet']:
1729
raise ValueError("engine must be one of 'pyarrow', 'fastparquet'")
1830

@@ -98,7 +110,7 @@ def read(self, path):
98110
return self.api.ParquetFile(path).to_pandas()
99111

100112

101-
def to_parquet(df, path, engine=None, compression='snappy', **kwargs):
113+
def to_parquet(df, path, engine='auto', compression='snappy', **kwargs):
102114
"""
103115
Write a DataFrame to the parquet format.
104116
@@ -107,10 +119,10 @@ def to_parquet(df, path, engine=None, compression='snappy', **kwargs):
107119
df : DataFrame
108120
path : string
109121
File path
110-
engine : str, optional
111-
The parquet engine, one of {'pyarrow', 'fastparquet'}
112-
If None, will use the option: `io.parquet.engine`, which
113-
defaults to 'pyarrow'
122+
engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
123+
Parquet library to use. If 'auto', then the option
124+
'io.parquet.engine' is used. If that option is also 'auto', then the first
125+
library to be installed is used.
114126
compression : str, optional, default 'snappy'
115127
compression method, includes {'gzip', 'snappy', 'brotli'}
116128
kwargs
@@ -156,7 +168,7 @@ def to_parquet(df, path, engine=None, compression='snappy', **kwargs):
156168
return impl.write(df, path, compression=compression)
157169

158170

159-
def read_parquet(path, engine=None, **kwargs):
171+
def read_parquet(path, engine='auto', **kwargs):
160172
"""
161173
Load a parquet object from the file path, returning a DataFrame.
162174
@@ -166,9 +178,10 @@ def read_parquet(path, engine=None, **kwargs):
166178
----------
167179
path : string
168180
File path
169-
engine : str, optional
170-
The parquet engine, one of {'pyarrow', 'fastparquet'}
171-
if None, will use the option: `io.parquet.engine`
181+
engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
182+
Parquet reader library to use. If 'auto', then the option
183+
'io.parquet.engine' is used. If that option is also 'auto', then the first
184+
library to be installed is used.
172185
kwargs are passed to the engine
173186
174187
Returns

pandas/tests/io/test_parquet.py

+34-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@
77
import numpy as np
88
import pandas as pd
99
from pandas.compat import PY3, is_platform_windows
10-
from pandas.io.parquet import to_parquet, read_parquet
10+
from pandas.io.parquet import (to_parquet, read_parquet, get_engine,
11+
PyArrowImpl, FastParquetImpl)
1112
from pandas.util import testing as tm
1213

1314
try:
@@ -100,6 +101,38 @@ def test_options_fp(df_compat, fp):
100101
tm.assert_frame_equal(result, df)
101102

102103

104+
def test_options_auto(df_compat, fp, pa):
105+
106+
df = df_compat
107+
with tm.ensure_clean() as path:
108+
109+
with pd.option_context('io.parquet.engine', 'auto'):
110+
df.to_parquet(path)
111+
112+
result = read_parquet(path, compression=None)
113+
tm.assert_frame_equal(result, df)
114+
115+
116+
def test_options_get_engine(fp, pa):
117+
assert isinstance(get_engine('pyarrow'), PyArrowImpl)
118+
assert isinstance(get_engine('fastparquet'), FastParquetImpl)
119+
120+
with pd.option_context('io.parquet.engine', 'pyarrow'):
121+
assert isinstance(get_engine('auto'), PyArrowImpl)
122+
assert isinstance(get_engine('pyarrow'), PyArrowImpl)
123+
assert isinstance(get_engine('fastparquet'), FastParquetImpl)
124+
125+
with pd.option_context('io.parquet.engine', 'fastparquet'):
126+
assert isinstance(get_engine('auto'), FastParquetImpl)
127+
assert isinstance(get_engine('pyarrow'), PyArrowImpl)
128+
assert isinstance(get_engine('fastparquet'), FastParquetImpl)
129+
130+
with pd.option_context('io.parquet.engine', 'auto'):
131+
assert isinstance(get_engine('auto'), PyArrowImpl)
132+
assert isinstance(get_engine('pyarrow'), PyArrowImpl)
133+
assert isinstance(get_engine('fastparquet'), FastParquetImpl)
134+
135+
103136
@pytest.mark.xfail(reason="fp does not ignore pa index __index_level_0__")
104137
def test_cross_engine_pa_fp(df_cross_compat, pa, fp):
105138
# cross-compat with differing reading/writing engines

0 commit comments

Comments
 (0)