
Commit 8d7e876

dhirschfeld authored and jorisvandenbossche committed
ENH: support non default indexes in writing to Parquet (#18629)
fastparquet automatically names an index 'index' if it doesn't already have a name
1 parent 2aa4aa9 commit 8d7e876
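
The fastparquet behavior noted in the commit message, sketched (file name is illustrative; assumes pandas with this commit plus fastparquet >= 0.1.0):

    import pandas as pd

    # a non-default index with no name
    df = pd.DataFrame({"a": [1, 2, 3]}, index=[10, 20, 30])
    df.to_parquet("unnamed.parquet", engine="fastparquet")
    # fastparquet stores the unnamed index under the name 'index',
    # so the round-tripped frame's index is expected to come back
    # named 'index' rather than unnamed
    result = pd.read_parquet("unnamed.parquet", engine="fastparquet")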

File tree

4 files changed: +191 -116 lines

doc/source/io.rst (+1, -4)

@@ -4504,11 +4504,8 @@ dtypes, including extension dtypes such as datetime with tz.
 
 Several caveats.
 
-- The format will NOT write an ``Index``, or ``MultiIndex`` for the
-  ``DataFrame`` and will raise an error if a non-default one is provided. You
-  can ``.reset_index()`` to store the index or ``.reset_index(drop=True)`` to
-  ignore it.
 - Duplicate column names and non-string columns names are not supported
+- Index level names, if specified, must be strings
 - Categorical dtypes can be serialized to parquet, but will de-serialize as ``object`` dtype.
 - Non supported types include ``Period`` and actual python object types. These will raise a helpful error message
   on an attempt at serialization.
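
The new caveat in practice, as a minimal sketch (file name is illustrative; assumes a parquet engine that supports non-default indexes):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2]})
    df.index.name = 0  # a non-string index level name
    # expected to raise ValueError: Index level names must be strings
    df.to_parquet("bad_name.parquet")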

doc/source/whatsnew/v0.21.1.txt (+3, -2)

@@ -61,6 +61,9 @@ New features
 Improvements to the Parquet IO functionality
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
+- :func:`DataFrame.to_parquet` will now write non-default indexes when the
+  underlying engine supports it. The indexes will be preserved when reading
+  back in with :func:`read_parquet` (:issue:`18581`).
 - :func:`read_parquet` now allows to specify the columns to read from a parquet file (:issue:`18154`)
 - :func:`read_parquet` now allows to specify kwargs which are passed to the respective engine (:issue:`18216`)

@@ -91,8 +94,6 @@ Performance Improvements
 
 Bug Fixes
 ~~~~~~~~~
--
-
 
 Conversion
 ^^^^^^^^^^
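
A sketch of the round-trip behavior described in the whatsnew entry above (file name is illustrative; with the default engine='auto', pandas picks pyarrow or fastparquet, and a non-default index needs pyarrow >= 0.7.0 or fastparquet):

    import pandas as pd

    idx = pd.Index(["x", "y", "z"], name="key")
    df = pd.DataFrame({"a": [1, 2, 3]}, index=idx)
    df.to_parquet("roundtrip.parquet")
    back = pd.read_parquet("roundtrip.parquet")
    # back.index is expected to equal idx, name included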

pandas/io/parquet.py (+116, -75)

@@ -3,7 +3,8 @@
 from warnings import catch_warnings
 from distutils.version import LooseVersion
 from pandas import DataFrame, RangeIndex, Int64Index, get_option
-from pandas.compat import range
+from pandas.compat import string_types
+from pandas.core.common import AbstractMethodError
 from pandas.io.common import get_filepath_or_buffer
 
 
@@ -39,39 +40,75 @@ def get_engine(engine):
         return FastParquetImpl()
 
 
-class PyArrowImpl(object):
+class BaseImpl(object):
+
+    api = None  # module
+
+    @staticmethod
+    def validate_dataframe(df):
+
+        if not isinstance(df, DataFrame):
+            raise ValueError("to_parquet only supports IO with DataFrames")
+
+        # must have value column names (strings only)
+        if df.columns.inferred_type not in {'string', 'unicode'}:
+            raise ValueError("parquet must have string column names")
+
+        # index level names must be strings
+        valid_names = all(
+            isinstance(name, string_types)
+            for name in df.index.names
+            if name is not None
+        )
+        if not valid_names:
+            raise ValueError("Index level names must be strings")
+
+    def write(self, df, path, compression, **kwargs):
+        raise AbstractMethodError(self)
+
+    def read(self, path, columns=None, **kwargs):
+        raise AbstractMethodError(self)
+
+
+class PyArrowImpl(BaseImpl):
 
     def __init__(self):
         # since pandas is a dependency of pyarrow
         # we need to import on first use
-
         try:
             import pyarrow
             import pyarrow.parquet
         except ImportError:
-            raise ImportError("pyarrow is required for parquet support\n\n"
-                              "you can install via conda\n"
-                              "conda install pyarrow -c conda-forge\n"
-                              "\nor via pip\n"
-                              "pip install -U pyarrow\n")
-
-        if LooseVersion(pyarrow.__version__) < LooseVersion('0.4.1'):
-            raise ImportError("pyarrow >= 0.4.1 is required for parquet"
-                              "support\n\n"
-                              "you can install via conda\n"
-                              "conda install pyarrow -c conda-forge\n"
-                              "\nor via pip\n"
-                              "pip install -U pyarrow\n")
-
-        self._pyarrow_lt_050 = (LooseVersion(pyarrow.__version__) <
-                                LooseVersion('0.5.0'))
-        self._pyarrow_lt_060 = (LooseVersion(pyarrow.__version__) <
-                                LooseVersion('0.6.0'))
+            raise ImportError(
+                "pyarrow is required for parquet support\n\n"
+                "you can install via conda\n"
+                "conda install pyarrow -c conda-forge\n"
+                "\nor via pip\n"
+                "pip install -U pyarrow\n"
+            )
+        if LooseVersion(pyarrow.__version__) < '0.4.1':
+            raise ImportError(
+                "pyarrow >= 0.4.1 is required for parquet support\n\n"
+                "you can install via conda\n"
+                "conda install pyarrow -c conda-forge\n"
+                "\nor via pip\n"
+                "pip install -U pyarrow\n"
+            )
+
+        self._pyarrow_lt_060 = (
+            LooseVersion(pyarrow.__version__) < LooseVersion('0.6.0'))
+        self._pyarrow_lt_070 = (
+            LooseVersion(pyarrow.__version__) < LooseVersion('0.7.0'))
+
         self.api = pyarrow
 
     def write(self, df, path, compression='snappy',
               coerce_timestamps='ms', **kwargs):
+        self.validate_dataframe(df)
+        if self._pyarrow_lt_070:
+            self._validate_write_lt_070(df)
         path, _, _ = get_filepath_or_buffer(path)
+
         if self._pyarrow_lt_060:
             table = self.api.Table.from_pandas(df, timestamps_to_ms=True)
             self.api.parquet.write_table(
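
The BaseImpl.validate_dataframe hook added in this hunk centralizes the checks both engines share; a minimal sketch of what it rejects (calling the staticmethod directly is for illustration only; in the codebase it runs inside each engine's write):

    import pandas as pd
    from pandas.io.parquet import BaseImpl

    df = pd.DataFrame({0: [1, 2]})  # non-string column name
    try:
        BaseImpl.validate_dataframe(df)
    except ValueError as exc:
        print(exc)  # parquet must have string column names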
@@ -85,36 +122,75 @@ def write(self, df, path, compression='snappy',
 
     def read(self, path, columns=None, **kwargs):
         path, _, _ = get_filepath_or_buffer(path)
+        if self._pyarrow_lt_070:
+            return self.api.parquet.read_pandas(path, columns=columns,
+                                                **kwargs).to_pandas()
+        kwargs['use_pandas_metadata'] = True
         return self.api.parquet.read_table(path, columns=columns,
                                            **kwargs).to_pandas()
 
-
-class FastParquetImpl(object):
+    def _validate_write_lt_070(self, df):
+        # Compatibility shim for pyarrow < 0.7.0
+        # TODO: Remove in pandas 0.22.0
+        from pandas.core.indexes.multi import MultiIndex
+        if isinstance(df.index, MultiIndex):
+            msg = (
+                "Multi-index DataFrames are only supported "
+                "with pyarrow >= 0.7.0"
+            )
+            raise ValueError(msg)
+        # Validate index
+        if not isinstance(df.index, Int64Index):
+            msg = (
+                "pyarrow < 0.7.0 does not support serializing {} for the "
+                "index; you can .reset_index() to make the index into "
+                "column(s), or install the latest version of pyarrow or "
+                "fastparquet."
+            )
+            raise ValueError(msg.format(type(df.index)))
+        if not df.index.equals(RangeIndex(len(df))):
+            raise ValueError(
+                "pyarrow < 0.7.0 does not support serializing a non-default "
+                "index; you can .reset_index() to make the index into "
+                "column(s), or install the latest version of pyarrow or "
+                "fastparquet."
+            )
+        if df.index.name is not None:
+            raise ValueError(
+                "pyarrow < 0.7.0 does not serialize indexes with a name; you "
+                "can set the index.name to None or install the latest version "
+                "of pyarrow or fastparquet."
+            )
+
+
+class FastParquetImpl(BaseImpl):
 
     def __init__(self):
         # since pandas is a dependency of fastparquet
         # we need to import on first use
-
         try:
             import fastparquet
         except ImportError:
-            raise ImportError("fastparquet is required for parquet support\n\n"
-                              "you can install via conda\n"
-                              "conda install fastparquet -c conda-forge\n"
-                              "\nor via pip\n"
-                              "pip install -U fastparquet")
-
-        if LooseVersion(fastparquet.__version__) < LooseVersion('0.1.0'):
-            raise ImportError("fastparquet >= 0.1.0 is required for parquet "
-                              "support\n\n"
-                              "you can install via conda\n"
-                              "conda install fastparquet -c conda-forge\n"
-                              "\nor via pip\n"
-                              "pip install -U fastparquet")
-
+            raise ImportError(
+                "fastparquet is required for parquet support\n\n"
+                "you can install via conda\n"
+                "conda install fastparquet -c conda-forge\n"
+                "\nor via pip\n"
+                "pip install -U fastparquet"
+            )
+        if LooseVersion(fastparquet.__version__) < '0.1.0':
+            raise ImportError(
+                "fastparquet >= 0.1.0 is required for parquet "
+                "support\n\n"
+                "you can install via conda\n"
+                "conda install fastparquet -c conda-forge\n"
+                "\nor via pip\n"
+                "pip install -U fastparquet"
+            )
         self.api = fastparquet
 
     def write(self, df, path, compression='snappy', **kwargs):
+        self.validate_dataframe(df)
         # thriftpy/protocol/compact.py:339:
         # DeprecationWarning: tostring() is deprecated.
         # Use tobytes() instead.
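
What the _validate_write_lt_070 shim above guards against, sketched (only triggers when an older pyarrow is the active engine; file name is illustrative):

    import pandas as pd

    mi = pd.MultiIndex.from_tuples([(0, "x"), (1, "y")], names=["n", "s"])
    df = pd.DataFrame({"a": [1, 2]}, index=mi)
    # with pyarrow < 0.7.0 this is expected to raise:
    #   ValueError: Multi-index DataFrames are only supported with pyarrow >= 0.7.0
    df.to_parquet("multi.parquet", engine="pyarrow")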
@@ -125,7 +201,8 @@ def write(self, df, path, compression='snappy', **kwargs):
 
     def read(self, path, columns=None, **kwargs):
         path, _, _ = get_filepath_or_buffer(path)
-        return self.api.ParquetFile(path).to_pandas(columns=columns, **kwargs)
+        parquet_file = self.api.ParquetFile(path)
+        return parquet_file.to_pandas(columns=columns, **kwargs)
 
 
 def to_parquet(df, path, engine='auto', compression='snappy', **kwargs):
@@ -146,43 +223,7 @@ def to_parquet(df, path, engine='auto', compression='snappy', **kwargs):
     kwargs
         Additional keyword arguments passed to the engine
     """
-
     impl = get_engine(engine)
-
-    if not isinstance(df, DataFrame):
-        raise ValueError("to_parquet only support IO with DataFrames")
-
-    valid_types = {'string', 'unicode'}
-
-    # validate index
-    # --------------
-
-    # validate that we have only a default index
-    # raise on anything else as we don't serialize the index
-
-    if not isinstance(df.index, Int64Index):
-        raise ValueError("parquet does not support serializing {} "
-                         "for the index; you can .reset_index()"
-                         "to make the index into column(s)".format(
-                             type(df.index)))
-
-    if not df.index.equals(RangeIndex.from_range(range(len(df)))):
-        raise ValueError("parquet does not support serializing a "
-                         "non-default index for the index; you "
-                         "can .reset_index() to make the index "
-                         "into column(s)")
-
-    if df.index.name is not None:
-        raise ValueError("parquet does not serialize index meta-data on a "
-                         "default index")
-
-    # validate columns
-    # ----------------
-
-    # must have value column names (strings only)
-    if df.columns.inferred_type not in valid_types:
-        raise ValueError("parquet must have string column names")
-
     return impl.write(df, path, compression=compression, **kwargs)
 
 
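
After this refactor, to_parquet reduces to engine dispatch plus delegation, with all validation living in the engine classes; a rough sketch of the equivalent call flow (file name is illustrative):

    import pandas as pd
    from pandas.io.parquet import get_engine

    df = pd.DataFrame({"a": [1, 2, 3]})
    # to_parquet(df, 'out.parquet') is now essentially:
    impl = get_engine('auto')  # PyArrowImpl or FastParquetImpl
    impl.write(df, 'out.parquet', compression='snappy')  # validates, then writes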
