Commit bdb7a16

dargueta authored and jorisvandenbossche committed
ENH: Add support for excluding the index from Parquet files (GH20768) (#22266)
1 parent 4612a82 commit bdb7a16

File tree

  doc/source/io.rst
  doc/source/whatsnew/v0.24.0.txt
  pandas/core/frame.py
  pandas/io/parquet.py
  pandas/tests/io/test_parquet.py

5 files changed: +109 -11 lines changed

doc/source/io.rst (+38)

@@ -4570,6 +4570,9 @@ dtypes, including extension dtypes such as datetime with tz.
 Several caveats.

 * Duplicate column names and non-string column names are not supported.
+* The ``pyarrow`` engine always writes the index to the output, but ``fastparquet`` only writes non-default
+  indexes. This extra column can cause problems for non-Pandas consumers that are not expecting it. You can
+  force including or omitting indexes with the ``index`` argument, regardless of the underlying engine.
 * Index level names, if specified, must be strings.
 * Categorical dtypes can be serialized to parquet, but will de-serialize as ``object`` dtype.
 * Unsupported types include ``Period`` and actual Python object types. These will raise a helpful error message
@@ -4633,6 +4636,41 @@ Read only certain columns of a parquet file.
     os.remove('example_pa.parquet')
     os.remove('example_fp.parquet')

+
+Handling Indexes
+''''''''''''''''
+
+Serializing a ``DataFrame`` to parquet may include the implicit index as one or
+more columns in the output file. Thus, this code:
+
+.. ipython:: python
+
+    df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
+    df.to_parquet('test.parquet', engine='pyarrow')
+
+creates a parquet file with *three* columns if you use ``pyarrow`` for serialization:
+``a``, ``b``, and ``__index_level_0__``. If you're using ``fastparquet``, the
+index `may or may not <https://fastparquet.readthedocs.io/en/latest/api.html#fastparquet.write>`_
+be written to the file.
+
+This unexpected extra column causes some databases like Amazon Redshift to reject
+the file, because that column doesn't exist in the target table.
+
+If you want to omit a dataframe's indexes when writing, pass ``index=False`` to
+:func:`~pandas.DataFrame.to_parquet`:
+
+.. ipython:: python
+
+    df.to_parquet('test.parquet', index=False)
+
+This creates a parquet file with just the two expected columns, ``a`` and ``b``.
+If your ``DataFrame`` has a custom index, you won't get it back when you load
+this file into a ``DataFrame``.
+
+Passing ``index=True`` will *always* write the index, even if that's not the
+underlying engine's default behavior.
+
+
 .. _io.sql:

 SQL Queries
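A quick way to see the documented behavior end to end, as a sketch rather than part of the commit: inspect the written file's schema. This assumes a pandas build with this change, plus a pyarrow that provides ``pyarrow.parquet.read_schema``; the file names are illustrative only.

    import pandas as pd
    import pyarrow.parquet as pq

    df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})

    # Engine default: pyarrow of this era serializes the index as a
    # regular column named __index_level_0__.
    df.to_parquet('with_index.parquet', engine='pyarrow')
    print(pq.read_schema('with_index.parquet').names)
    # ['a', 'b', '__index_level_0__']

    # index=False omits it, leaving only the data columns.
    df.to_parquet('without_index.parquet', engine='pyarrow', index=False)
    print(pq.read_schema('without_index.parquet').names)
    # ['a', 'b']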

doc/source/whatsnew/v0.24.0.txt (+4)

@@ -17,6 +17,10 @@ New features

 - ``ExcelWriter`` now accepts ``mode`` as a keyword argument, enabling append to existing workbooks when using the ``openpyxl`` engine (:issue:`3441`)

+- :func:`DataFrame.to_parquet` now accepts ``index`` as an argument, allowing
+  the user to override the engine's default behavior to include or omit the
+  dataframe's indexes from the resulting Parquet file. (:issue:`20768`)
+
 .. _whatsnew_0240.enhancements.extension_array_operators:

 ``ExtensionArray`` operator support

pandas/core/frame.py (+9 -2)

@@ -1902,7 +1902,7 @@ def to_feather(self, fname):
         to_feather(self, fname)

     def to_parquet(self, fname, engine='auto', compression='snappy',
-                   **kwargs):
+                   index=None, **kwargs):
         """
         Write a DataFrame to the binary parquet format.

@@ -1924,6 +1924,13 @@ def to_parquet(self, fname, engine='auto', compression='snappy',
             'pyarrow' is unavailable.
         compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy'
             Name of the compression to use. Use ``None`` for no compression.
+        index : bool, default None
+            If ``True``, include the dataframe's index(es) in the file output.
+            If ``False``, they will not be written to the file. If ``None``,
+            the behavior depends on the chosen engine.
+
+            .. versionadded:: 0.24.0
+
         **kwargs
             Additional arguments passed to the parquet library. See
             :ref:`pandas io <io.parquet>` for more details.
@@ -1952,7 +1959,7 @@ def to_parquet(self, fname, engine='auto', compression='snappy',
         """
         from pandas.io.parquet import to_parquet
         to_parquet(self, fname, engine,
-                   compression=compression, **kwargs)
+                   compression=compression, index=index, **kwargs)

     @Substitution(header='Write out the column names. If a list of strings '
                   'is given, it is assumed to be aliases for the '
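In user code the new keyword reads as below; a minimal usage sketch, with file names illustrative only.

    import pandas as pd

    df = pd.DataFrame({'a': [1, 2]}, index=['x', 'y'])

    # Force the index into the file, regardless of engine defaults.
    df.to_parquet('keep_index.parquet', index=True)

    # Omit it entirely; reading the file back yields a default RangeIndex.
    df.to_parquet('drop_index.parquet', index=False)

    # index=None (the default) preserves each engine's historical behavior.
    df.to_parquet('engine_default.parquet')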

pandas/io/parquet.py (+24 -9)

@@ -103,19 +103,27 @@ def __init__(self):
         self.api = pyarrow

     def write(self, df, path, compression='snappy',
-              coerce_timestamps='ms', **kwargs):
+              coerce_timestamps='ms', index=None, **kwargs):
         self.validate_dataframe(df)
-        if self._pyarrow_lt_070:
+
+        # Only validate the index if we're writing it.
+        if self._pyarrow_lt_070 and index is not False:
             self._validate_write_lt_070(df)
         path, _, _, _ = get_filepath_or_buffer(path, mode='wb')

+        if index is None:
+            from_pandas_kwargs = {}
+        else:
+            from_pandas_kwargs = {'preserve_index': index}
+
         if self._pyarrow_lt_060:
-            table = self.api.Table.from_pandas(df, timestamps_to_ms=True)
+            table = self.api.Table.from_pandas(df, timestamps_to_ms=True,
+                                               **from_pandas_kwargs)
             self.api.parquet.write_table(
                 table, path, compression=compression, **kwargs)

         else:
-            table = self.api.Table.from_pandas(df)
+            table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
             self.api.parquet.write_table(
                 table, path, compression=compression,
                 coerce_timestamps=coerce_timestamps, **kwargs)
@@ -197,7 +205,7 @@ def __init__(self):
             )
         self.api = fastparquet

-    def write(self, df, path, compression='snappy', **kwargs):
+    def write(self, df, path, compression='snappy', index=None, **kwargs):
         self.validate_dataframe(df)
         # thriftpy/protocol/compact.py:339:
         # DeprecationWarning: tostring() is deprecated.
@@ -214,8 +222,8 @@ def write(self, df, path, compression='snappy', **kwargs):
         path, _, _, _ = get_filepath_or_buffer(path)

         with catch_warnings(record=True):
-            self.api.write(path, df,
-                           compression=compression, **kwargs)
+            self.api.write(path, df, compression=compression,
+                           write_index=index, **kwargs)

     def read(self, path, columns=None, **kwargs):
         if is_s3_url(path):
@@ -234,7 +242,8 @@ def read(self, path, columns=None, **kwargs):
         return parquet_file.to_pandas(columns=columns, **kwargs)


-def to_parquet(df, path, engine='auto', compression='snappy', **kwargs):
+def to_parquet(df, path, engine='auto', compression='snappy', index=None,
+               **kwargs):
     """
     Write a DataFrame to the parquet format.

@@ -250,11 +259,17 @@ def to_parquet(df, path, engine='auto', compression='snappy', **kwargs):
         'pyarrow' is unavailable.
     compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy'
         Name of the compression to use. Use ``None`` for no compression.
+    index : bool, default None
+        If ``True``, include the dataframe's index(es) in the file output. If
+        ``False``, they will not be written to the file. If ``None``, the
+        engine's default behavior will be used.
+
+        .. versionadded:: 0.24.0
+
     kwargs
         Additional keyword arguments passed to the engine
     """
     impl = get_engine(engine)
-    return impl.write(df, path, compression=compression, **kwargs)
+    return impl.write(df, path, compression=compression, index=index, **kwargs)


def read_parquet(path, engine='auto', columns=None, **kwargs):
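The heart of this change is translating the single public ``index`` flag into each engine's native option. A self-contained sketch of that mapping; the helper name ``index_to_engine_kwargs`` is illustrative, not pandas API:

    def index_to_engine_kwargs(engine, index):
        """Map the public ``index`` flag to engine-native keywords.

        ``None`` means "defer to the engine's default behavior".
        """
        if engine == 'pyarrow':
            # pyarrow: omit preserve_index entirely so its default
            # (write the index) applies; otherwise pass the bool through
            # to Table.from_pandas(preserve_index=...).
            return {} if index is None else {'preserve_index': index}
        if engine == 'fastparquet':
            # fastparquet.write accepts write_index=None natively and
            # then writes only non-default indexes.
            return {'write_index': index}
        raise ValueError("engine must be 'pyarrow' or 'fastparquet'")

    assert index_to_engine_kwargs('pyarrow', None) == {}
    assert index_to_engine_kwargs('fastparquet', False) == {'write_index': False}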

pandas/tests/io/test_parquet.py (+34)

@@ -368,6 +368,40 @@ def test_multiindex_with_columns(self, pa_ge_070):
         check_round_trip(df, engine, read_kwargs={'columns': ['A', 'B']},
                          expected=df[['A', 'B']])

+    def test_write_ignoring_index(self, engine):
+        # ENH 20768
+        # Ensure index=False omits the index from the written Parquet file.
+        df = pd.DataFrame({'a': [1, 2, 3], 'b': ['q', 'r', 's']})
+
+        write_kwargs = {
+            'compression': None,
+            'index': False,
+        }
+
+        # Because we're dropping the index, we expect the loaded dataframe to
+        # have the default integer index.
+        expected = df.reset_index(drop=True)
+
+        check_round_trip(df, engine, write_kwargs=write_kwargs,
+                         expected=expected)
+
+        # Ignore custom index
+        df = pd.DataFrame({'a': [1, 2, 3], 'b': ['q', 'r', 's']},
+                          index=['zyx', 'wvu', 'tsr'])
+
+        check_round_trip(df, engine, write_kwargs=write_kwargs,
+                         expected=expected)
+
+        # Ignore multi-indexes as well.
+        arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
+                  ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
+        df = pd.DataFrame({'one': [i for i in range(8)],
+                           'two': [-i for i in range(8)]}, index=arrays)
+
+        expected = df.reset_index(drop=True)
+        check_round_trip(df, engine, write_kwargs=write_kwargs,
+                         expected=expected)
+

 class TestParquetPyArrow(Base):
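The round-trip property this test exercises through the internal ``check_round_trip`` helper can also be checked with public API alone. A minimal sketch, assuming either engine is installed; the file name is illustrative:

    import pandas as pd
    from pandas.testing import assert_frame_equal

    df = pd.DataFrame({'a': [1, 2, 3], 'b': ['q', 'r', 's']},
                      index=['zyx', 'wvu', 'tsr'])
    df.to_parquet('no_index.parquet', compression=None, index=False)

    # The custom index is gone; the file round-trips to a RangeIndex.
    result = pd.read_parquet('no_index.parquet')
    assert_frame_equal(result, df.reset_index(drop=True))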
