
Commit 4bf7f56 (parent 27a64b2)

Allow non-default indexes in to_parquet, when supported by the underlying engine. Fixes pandas-dev#18581
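
For orientation, a minimal sketch of the behavior this commit enables, assuming pyarrow >= 0.7.0 is installed (file name illustrative):

    import pandas as pd

    # a DataFrame with a non-default, named index; previously to_parquet
    # raised for anything other than an unnamed default RangeIndex
    df = pd.DataFrame({'a': [1, 2, 3]},
                      index=pd.Index([10, 20, 30], name='my_index'))

    df.to_parquet('example.parquet', engine='pyarrow')
    result = pd.read_parquet('example.parquet', engine='pyarrow')
    assert result.index.name == 'my_index'  # index survives the round trip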

7 files changed: +181, -154 lines

ci/requirements-2.7.sh (+1, -1)

@@ -4,4 +4,4 @@ source activate pandas
 
 echo "install 27"
 
-conda install -n pandas -c conda-forge feather-format pyarrow=0.4.1 fastparquet
+conda install -n pandas -c conda-forge feather-format pyarrow=0.7.0 fastparquet

ci/requirements-3.5.sh (+1, -1)

@@ -8,4 +8,4 @@ echo "install 35"
 conda remove -n pandas python-dateutil --force
 pip install python-dateutil
 
-conda install -n pandas -c conda-forge feather-format pyarrow=0.5.0
+conda install -n pandas -c conda-forge feather-format pyarrow=0.7.0

doc/source/install.rst (+1, -1)

@@ -229,7 +229,7 @@ Optional Dependencies
 * `xarray <http://xarray.pydata.org>`__: pandas like handling for > 2 dims, needed for converting Panels to xarray objects. Version 0.7.0 or higher is recommended.
 * `PyTables <http://www.pytables.org>`__: necessary for HDF5-based storage. Version 3.0.0 or higher required, Version 3.2.1 or higher highly recommended.
 * `Feather Format <https://github.com/wesm/feather>`__: necessary for feather-based storage, version 0.3.1 or higher.
-* `Apache Parquet <https://parquet.apache.org/>`__, either `pyarrow <http://arrow.apache.org/docs/python/>`__ (>= 0.4.1) or `fastparquet <https://fastparquet.readthedocs.io/en/latest/necessary>`__ (>= 0.0.6) for parquet-based storage. The `snappy <https://pypi.python.org/pypi/python-snappy>`__ and `brotli <https://pypi.python.org/pypi/brotlipy>`__ libraries are available for compression support.
+* `Apache Parquet <https://parquet.apache.org/>`__, either `pyarrow <http://arrow.apache.org/docs/python/>`__ (>= 0.7.0) or `fastparquet <https://fastparquet.readthedocs.io/en/latest/necessary>`__ (>= 0.1.0) for parquet-based storage. The `snappy <https://pypi.python.org/pypi/python-snappy>`__ and `brotli <https://pypi.python.org/pypi/brotlipy>`__ libraries are available for compression support.
 * `SQLAlchemy <http://www.sqlalchemy.org>`__: for SQL database support. Version 0.8.1 or higher recommended. Besides SQLAlchemy, you also need a database specific driver. You can find an overview of supported drivers for each SQL dialect in the `SQLAlchemy docs <http://docs.sqlalchemy.org/en/latest/dialects/index.html>`__. Some common drivers are:
 
 * `psycopg2 <http://initd.org/psycopg/>`__: for PostgreSQL
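
Either engine satisfies this dependency; which one pandas uses is selectable per call. A short sketch (file names illustrative):

    import pandas as pd

    df = pd.DataFrame({'a': list(range(5))})

    # engine='auto' (the default) resolves through the io.parquet.engine
    # option; an engine can also be named explicitly
    df.to_parquet('with_pyarrow.parquet', engine='pyarrow')
    df.to_parquet('with_fastparquet.parquet', engine='fastparquet')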

doc/source/io.rst (+1, -4)

@@ -4504,11 +4504,8 @@ dtypes, including extension dtypes such as datetime with tz.
 
 Several caveats.
 
-- The format will NOT write an ``Index``, or ``MultiIndex`` for the
-  ``DataFrame`` and will raise an error if a non-default one is provided. You
-  can ``.reset_index()`` to store the index or ``.reset_index(drop=True)`` to
-  ignore it.
 - Duplicate column names and non-string columns names are not supported
+- Index level names, if specified, must be strings
 - Categorical dtypes can be serialized to parquet, but will de-serialize as ``object`` dtype.
 - Non supported types include ``Period`` and actual python object types. These will raise a helpful error message
   on an attempt at serialization.
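
A sketch of the new index-name caveat in practice, assuming pyarrow >= 0.7.0 (file names illustrative):

    import pandas as pd

    df = pd.DataFrame({'a': [1, 2]})

    # a non-string index level name fails the validation added here
    df.index.name = 0
    try:
        df.to_parquet('bad_name.parquet')
    except ValueError as exc:
        print(exc)  # Index level names must be strings

    # a string name (or no name at all) is accepted
    df.index.name = 'idx'
    df.to_parquet('ok.parquet')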

doc/source/whatsnew/v0.22.0.txt (+1, -1)

@@ -326,4 +326,4 @@ Other
 ^^^^^
 
 - Improved error message when attempting to use a Python keyword as an identifier in a ``numexpr`` backed query (:issue:`18221`)
--
+-

pandas/io/parquet.py (+124, -88)
@@ -3,7 +3,8 @@
 from warnings import catch_warnings
 from distutils.version import LooseVersion
 from pandas import DataFrame, RangeIndex, Int64Index, get_option
-from pandas.compat import range
+from pandas.compat import range, string_types
+from pandas.core.common import AbstractMethodError
 from pandas.io.common import get_filepath_or_buffer
 
 
@@ -34,82 +35,152 @@ def get_engine(engine):
     return FastParquetImpl()
 
 
-class PyArrowImpl(object):
+class BaseImpl(object):
+
+    api = None  # module
+
+    @staticmethod
+    def validate_dataframe(df):
+        if not isinstance(df, DataFrame):
+            raise ValueError("to_parquet only supports IO with DataFrames")
+        # must have value column names (strings only)
+        if df.columns.inferred_type not in {'string', 'unicode'}:
+            raise ValueError("parquet must have string column names")
+        # index level names must be strings
+        valid_names = all(
+            isinstance(name, string_types)
+            for name in df.index.names
+            if name is not None
+        )
+        if not valid_names:
+            raise ValueError("Index level names must be strings")
+
+    def write(self, df, path, compression, **kwargs):
+        raise AbstractMethodError(self)
+
+    def read(self, path, columns=None, **kwargs):
+        raise AbstractMethodError(self)
+
+
+class PyArrowImpl(BaseImpl):
 
     def __init__(self):
         # since pandas is a dependency of pyarrow
         # we need to import on first use
-
         try:
             import pyarrow
             import pyarrow.parquet
         except ImportError:
-            raise ImportError("pyarrow is required for parquet support\n\n"
-                              "you can install via conda\n"
-                              "conda install pyarrow -c conda-forge\n"
-                              "\nor via pip\n"
-                              "pip install -U pyarrow\n")
-
-        if LooseVersion(pyarrow.__version__) < LooseVersion('0.4.1'):
-            raise ImportError("pyarrow >= 0.4.1 is required for parquet"
-                              "support\n\n"
-                              "you can install via conda\n"
-                              "conda install pyarrow -c conda-forge\n"
-                              "\nor via pip\n"
-                              "pip install -U pyarrow\n")
-
-        self._pyarrow_lt_050 = (LooseVersion(pyarrow.__version__) <
-                                LooseVersion('0.5.0'))
-        self._pyarrow_lt_060 = (LooseVersion(pyarrow.__version__) <
-                                LooseVersion('0.6.0'))
+            raise ImportError(
+                "pyarrow is required for parquet support\n\n"
+                "you can install via conda\n"
+                "conda install pyarrow -c conda-forge\n"
+                "\nor via pip\n"
+                "pip install -U pyarrow\n"
+            )
+        if LooseVersion(pyarrow.__version__) < '0.4.1':
+            raise ImportError(
+                "pyarrow >= 0.4.1 is required for parquet support\n\n"
+                "you can install via conda\n"
+                "conda install pyarrow -c conda-forge\n"
+                "\nor via pip\n"
+                "pip install -U pyarrow\n"
+            )
+        self._pyarrow_lt_070 = (
+            LooseVersion(pyarrow.__version__) < LooseVersion('0.7.0')
+        )
         self.api = pyarrow
 
     def write(self, df, path, compression='snappy',
               coerce_timestamps='ms', **kwargs):
+        self.validate_dataframe(df)
+        if self._pyarrow_lt_070:
+            self._validate_write_lt_070(
+                df, path, compression, coerce_timestamps, **kwargs
+            )
         path, _, _ = get_filepath_or_buffer(path)
-        if self._pyarrow_lt_060:
-            table = self.api.Table.from_pandas(df, timestamps_to_ms=True)
-            self.api.parquet.write_table(
-                table, path, compression=compression, **kwargs)
-
-        else:
-            table = self.api.Table.from_pandas(df)
-            self.api.parquet.write_table(
-                table, path, compression=compression,
-                coerce_timestamps=coerce_timestamps, **kwargs)
+        table = self.api.Table.from_pandas(df)
+        self.api.parquet.write_table(
+            table, path, compression=compression,
+            coerce_timestamps=coerce_timestamps, **kwargs)
 
     def read(self, path, columns=None, **kwargs):
         path, _, _ = get_filepath_or_buffer(path)
-        return self.api.parquet.read_table(path, columns=columns,
-                                           **kwargs).to_pandas()
-
-
-class FastParquetImpl(object):
+        parquet_file = self.api.parquet.ParquetFile(path)
+        if self._pyarrow_lt_070:
+            parquet_file.path = path
+            return self._read_lt_070(parquet_file, columns, **kwargs)
+        kwargs['use_pandas_metadata'] = True
+        return parquet_file.read(columns=columns, **kwargs).to_pandas()
+
+    def _validate_write_lt_070(self, df, path, compression='snappy',
+                               coerce_timestamps='ms', **kwargs):
+        # Compatibility shim for pyarrow < 0.7.0
+        # TODO: Remove in pandas 0.22.0
+        from pandas.core.indexes.multi import MultiIndex
+        if isinstance(df.index, MultiIndex):
+            msg = "Multi-index DataFrames are only supported with pyarrow >= 0.7.0"
+            raise ValueError(msg)
+        # Validate index
+        if not isinstance(df.index, Int64Index):
+            msg = (
+                "parquet does not support serializing {} for the index; "
+                "you can .reset_index() to make the index into column(s)"
+            )
+            raise ValueError(msg.format(type(df.index)))
+        if not df.index.equals(RangeIndex(len(df))):
+            raise ValueError(
+                "parquet does not support serializing a non-default index "
+                "for the index; you can .reset_index() to make the index "
+                "into column(s)"
+            )
+        if df.index.name is not None:
+            raise ValueError(
+                "parquet does not serialize index meta-data "
+                "on a default index"
+            )
+
+    def _read_lt_070(self, parquet_file, columns, **kwargs):
+        # Compatibility shim for pyarrow < 0.7.0
+        # TODO: Remove in pandas 0.22.0
+        from itertools import chain
+        import json
+        if columns is not None:
+            metadata = json.loads(parquet_file.metadata.metadata[b'pandas'])
+            columns = set(chain(columns, metadata['index_columns']))
+            kwargs['columns'] = columns
+        return self.api.parquet.read_table(parquet_file.path, **kwargs).to_pandas()
+
+
+class FastParquetImpl(BaseImpl):
 
     def __init__(self):
         # since pandas is a dependency of fastparquet
         # we need to import on first use
-
         try:
             import fastparquet
         except ImportError:
-            raise ImportError("fastparquet is required for parquet support\n\n"
-                              "you can install via conda\n"
-                              "conda install fastparquet -c conda-forge\n"
-                              "\nor via pip\n"
-                              "pip install -U fastparquet")
-
-        if LooseVersion(fastparquet.__version__) < LooseVersion('0.1.0'):
-            raise ImportError("fastparquet >= 0.1.0 is required for parquet "
-                              "support\n\n"
-                              "you can install via conda\n"
-                              "conda install fastparquet -c conda-forge\n"
-                              "\nor via pip\n"
-                              "pip install -U fastparquet")
-
+            raise ImportError(
+                "fastparquet is required for parquet support\n\n"
+                "you can install via conda\n"
+                "conda install fastparquet -c conda-forge\n"
+                "\nor via pip\n"
+                "pip install -U fastparquet"
+            )
+        if LooseVersion(fastparquet.__version__) < '0.1.0':
+            raise ImportError(
+                "fastparquet >= 0.1.0 is required for parquet "
+                "support\n\n"
+                "you can install via conda\n"
+                "conda install fastparquet -c conda-forge\n"
+                "\nor via pip\n"
+                "pip install -U fastparquet"
+            )
         self.api = fastparquet
 
     def write(self, df, path, compression='snappy', **kwargs):
+        self.validate_dataframe(df)
         # thriftpy/protocol/compact.py:339:
         # DeprecationWarning: tostring() is deprecated.
         # Use tobytes() instead.
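
For context on the _read_lt_070 shim above: pyarrow stores a JSON blob of pandas metadata in the parquet footer, and the shim merges the index columns named there into any user-supplied column selection. A sketch of inspecting that blob directly (path illustrative):

    import json
    import pyarrow.parquet as pq

    parquet_file = pq.ParquetFile('example.parquet')
    # the b'pandas' key holds the metadata written by Table.from_pandas
    metadata = json.loads(parquet_file.metadata.metadata[b'pandas'])
    # e.g. ['my_index'], or ['__index_level_0__'] for an unnamed index
    print(metadata['index_columns'])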
@@ -120,7 +191,8 @@ def write(self, df, path, compression='snappy', **kwargs):
 
     def read(self, path, columns=None, **kwargs):
         path, _, _ = get_filepath_or_buffer(path)
-        return self.api.ParquetFile(path).to_pandas(columns=columns, **kwargs)
+        parquet_file = self.api.ParquetFile(path)
+        return parquet_file.to_pandas(columns=columns, **kwargs)
 
 
 def to_parquet(df, path, engine='auto', compression='snappy', **kwargs):
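
Both engines accept a column subset on read; on the pyarrow side, the use_pandas_metadata=True flag set above pulls the index columns in automatically. A sketch (path illustrative):

    import pandas as pd

    # materializes only column 'a' plus whatever index columns the
    # pandas metadata names
    subset = pd.read_parquet('example.parquet', columns=['a'])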
@@ -141,43 +213,7 @@ def to_parquet(df, path, engine='auto', compression='snappy', **kwargs):
     kwargs
         Additional keyword arguments passed to the engine
     """
-
     impl = get_engine(engine)
-
-    if not isinstance(df, DataFrame):
-        raise ValueError("to_parquet only support IO with DataFrames")
-
-    valid_types = {'string', 'unicode'}
-
-    # validate index
-    # --------------
-
-    # validate that we have only a default index
-    # raise on anything else as we don't serialize the index
-
-    if not isinstance(df.index, Int64Index):
-        raise ValueError("parquet does not support serializing {} "
-                         "for the index; you can .reset_index()"
-                         "to make the index into column(s)".format(
-                             type(df.index)))
-
-    if not df.index.equals(RangeIndex.from_range(range(len(df)))):
-        raise ValueError("parquet does not support serializing a "
-                         "non-default index for the index; you "
-                         "can .reset_index() to make the index "
-                         "into column(s)")
-
-    if df.index.name is not None:
-        raise ValueError("parquet does not serialize index meta-data on a "
-                         "default index")
-
-    # validate columns
-    # ----------------
-
-    # must have value column names (strings only)
-    if df.columns.inferred_type not in valid_types:
-        raise ValueError("parquet must have string column names")
-
     return impl.write(df, path, compression=compression, **kwargs)
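
The validation block deleted here did not disappear: it moved into BaseImpl.validate_dataframe, so both engines now enforce it. A sketch of the surviving column-name check (file name illustrative):

    import pandas as pd

    df = pd.DataFrame({0: [1, 2], 1: [3, 4]})  # non-string column names
    try:
        df.to_parquet('bad_columns.parquet')
    except ValueError as exc:
        print(exc)  # parquet must have string column names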