@@ -103,19 +103,27 @@ def __init__(self):
103
103
self .api = pyarrow
104
104
105
105
def write (self , df , path , compression = 'snappy' ,
106
- coerce_timestamps = 'ms' , ** kwargs ):
106
+ coerce_timestamps = 'ms' , index = None , ** kwargs ):
107
107
self .validate_dataframe (df )
108
- if self ._pyarrow_lt_070 :
108
+
109
+ # Only validate the index if we're writing it.
110
+ if self ._pyarrow_lt_070 and index is not False :
109
111
self ._validate_write_lt_070 (df )
110
112
path , _ , _ , _ = get_filepath_or_buffer (path , mode = 'wb' )
111
113
114
+ if index is None :
115
+ from_pandas_kwargs = {}
116
+ else :
117
+ from_pandas_kwargs = {'preserve_index' : index }
118
+
112
119
if self ._pyarrow_lt_060 :
113
- table = self .api .Table .from_pandas (df , timestamps_to_ms = True )
120
+ table = self .api .Table .from_pandas (df , timestamps_to_ms = True ,
121
+ ** from_pandas_kwargs )
114
122
self .api .parquet .write_table (
115
123
table , path , compression = compression , ** kwargs )
116
124
117
125
else :
118
- table = self .api .Table .from_pandas (df )
126
+ table = self .api .Table .from_pandas (df , ** from_pandas_kwargs )
119
127
self .api .parquet .write_table (
120
128
table , path , compression = compression ,
121
129
coerce_timestamps = coerce_timestamps , ** kwargs )
@@ -197,7 +205,7 @@ def __init__(self):
197
205
)
198
206
self .api = fastparquet
199
207
200
- def write (self , df , path , compression = 'snappy' , ** kwargs ):
208
+ def write (self , df , path , compression = 'snappy' , index = None , ** kwargs ):
201
209
self .validate_dataframe (df )
202
210
# thriftpy/protocol/compact.py:339:
203
211
# DeprecationWarning: tostring() is deprecated.
@@ -214,8 +222,8 @@ def write(self, df, path, compression='snappy', **kwargs):
214
222
path , _ , _ , _ = get_filepath_or_buffer (path )
215
223
216
224
with catch_warnings (record = True ):
217
- self .api .write (path , df ,
218
- compression = compression , ** kwargs )
225
+ self .api .write (path , df , compression = compression ,
226
+ write_index = index , ** kwargs )
219
227
220
228
def read (self , path , columns = None , ** kwargs ):
221
229
if is_s3_url (path ):
@@ -234,7 +242,8 @@ def read(self, path, columns=None, **kwargs):
234
242
return parquet_file .to_pandas (columns = columns , ** kwargs )
235
243
236
244
237
- def to_parquet (df , path , engine = 'auto' , compression = 'snappy' , ** kwargs ):
245
+ def to_parquet (df , path , engine = 'auto' , compression = 'snappy' , index = None ,
246
+ ** kwargs ):
238
247
"""
239
248
Write a DataFrame to the parquet format.
240
249
@@ -250,11 +259,17 @@ def to_parquet(df, path, engine='auto', compression='snappy', **kwargs):
250
259
'pyarrow' is unavailable.
251
260
compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy'
252
261
Name of the compression to use. Use ``None`` for no compression.
262
+ index : bool, default None
263
+ If ``True``, include the dataframe's index(es) in the file output. If
264
+ ``False``, they will not be written to the file. If ``None``, the
265
+ engine's default behavior will be used.
266
+
267
+ .. versionadded:: 0.24.0
253
268
kwargs
254
269
Additional keyword arguments passed to the engine
255
270
"""
256
271
impl = get_engine (engine )
257
- return impl .write (df , path , compression = compression , ** kwargs )
272
+ return impl .write (df , path , compression = compression , index = index , ** kwargs )
258
273
259
274
260
275
def read_parquet (path , engine = 'auto' , columns = None , ** kwargs ):
0 commit comments