
Commit e45dfe8, committed Jul 26, 2017 (1 parent: f9a552d)

ENH: add to/from_parquet with pyarrow & fastparquet

18 files changed: +627 / -5 lines

‎ci/install_travis.sh

Lines changed: 1 addition & 0 deletions
@@ -156,6 +156,7 @@ fi
 echo
 echo "[removing installed pandas]"
 conda remove pandas -y --force
+pip uninstall -y pandas

 if [ "$BUILD_TEST" ]; then

‎ci/requirements-2.7.sh

Lines changed: 1 addition & 1 deletion
@@ -4,4 +4,4 @@ source activate pandas

 echo "install 27"

-conda install -n pandas -c conda-forge feather-format pyarrow=0.4.1
+conda install -n pandas -c conda-forge feather-format pyarrow=0.4.1 fastparquet

‎ci/requirements-3.5.sh

Lines changed: 2 additions & 2 deletions
@@ -4,8 +4,8 @@ source activate pandas

 echo "install 35"

-conda install -n pandas -c conda-forge feather-format pyarrow=0.4.1
-
 # pip install python-dateutil to get latest
 conda remove -n pandas python-dateutil --force
 pip install python-dateutil
+
+conda install -n pandas -c conda-forge feather-format pyarrow=0.4.1

‎ci/requirements-3.5_OSX.sh

Lines changed: 1 addition & 1 deletion
@@ -4,4 +4,4 @@ source activate pandas

 echo "install 35_OSX"

-conda install -n pandas -c conda-forge feather-format==0.3.1
+conda install -n pandas -c conda-forge feather-format==0.3.1 fastparquet

‎ci/requirements-3.6.pip

Lines changed: 1 addition & 0 deletions
(new file)

brotlipy

‎ci/requirements-3.6.run

Lines changed: 2 additions & 0 deletions
@@ -16,6 +16,8 @@ sqlalchemy
 pymysql
 feather-format
 pyarrow
+python-snappy
+fastparquet
 # psycopg2 (not avail on defaults ATM)
 beautifulsoup4
 s3fs

‎ci/requirements-3.6_WIN.run

Lines changed: 2 additions & 0 deletions
@@ -13,3 +13,5 @@ numexpr
 pytables
 matplotlib
 blosc
+fastparquet
+pyarrow

‎doc/source/install.rst

Lines changed: 1 addition & 0 deletions
@@ -236,6 +236,7 @@ Optional Dependencies
 * `xarray <http://xarray.pydata.org>`__: pandas like handling for > 2 dims, needed for converting Panels to xarray objects. Version 0.7.0 or higher is recommended.
 * `PyTables <http://www.pytables.org>`__: necessary for HDF5-based storage. Version 3.0.0 or higher required, Version 3.2.1 or higher highly recommended.
 * `Feather Format <https://github.com/wesm/feather>`__: necessary for feather-based storage, version 0.3.1 or higher.
+* ``Apache Parquet Format``: either `pyarrow <http://arrow.apache.org/docs/python/>`__ (>= 0.4.1) or `fastparquet <https://fastparquet.readthedocs.io/en/latest/>`__ (>= 0.0.6), necessary for parquet-based storage. The `snappy <https://pypi.python.org/pypi/python-snappy>`__ and `brotli <https://pypi.python.org/pypi/brotlipy>`__ libraries are available for compression support.
 * `SQLAlchemy <http://www.sqlalchemy.org>`__: for SQL database support. Version 0.8.1 or higher recommended. Besides SQLAlchemy, you also need a database specific driver. You can find an overview of supported drivers for each SQL dialect in the `SQLAlchemy docs <http://docs.sqlalchemy.org/en/latest/dialects/index.html>`__. Some common drivers are:

 * `psycopg2 <http://initd.org/psycopg/>`__: for PostgreSQL

‎doc/source/io.rst

Lines changed: 71 additions & 0 deletions
@@ -43,6 +43,7 @@ object. The corresponding ``writer`` functions are object methods that are acces
     binary;`MS Excel <https://en.wikipedia.org/wiki/Microsoft_Excel>`__;:ref:`read_excel<io.excel_reader>`;:ref:`to_excel<io.excel_writer>`
     binary;`HDF5 Format <https://support.hdfgroup.org/HDF5/whatishdf5.html>`__;:ref:`read_hdf<io.hdf5>`;:ref:`to_hdf<io.hdf5>`
     binary;`Feather Format <https://github.com/wesm/feather>`__;:ref:`read_feather<io.feather>`;:ref:`to_feather<io.feather>`
+    binary;`Parquet Format <https://parquet.apache.org/>`__;:ref:`read_parquet<io.parquet>`;:ref:`to_parquet<io.parquet>`
     binary;`Msgpack <http://msgpack.org/index.html>`__;:ref:`read_msgpack<io.msgpack>`;:ref:`to_msgpack<io.msgpack>`
     binary;`Stata <https://en.wikipedia.org/wiki/Stata>`__;:ref:`read_stata<io.stata_reader>`;:ref:`to_stata<io.stata_writer>`
     binary;`SAS <https://en.wikipedia.org/wiki/SAS_(software)>`__;:ref:`read_sas<io.sas_reader>`;

@@ -4550,6 +4551,76 @@ Read from a feather file.
    import os
    os.remove('example.feather')

+
+.. _io.parquet:
+
+Parquet
+-------
+
+.. versionadded:: 0.21.0
+
+Parquet provides a sharded, binary, columnar serialization for data frames. It is designed to make reading and
+writing data frames efficient, and to make sharing data across data analysis languages easy. Parquet can use a
+variety of compression techniques to shrink the file size as much as possible while still maintaining good read performance.
+
+Parquet is designed to faithfully serialize and de-serialize DataFrames, supporting all of the pandas
+dtypes, including extension dtypes such as categorical and datetime with tz.
+
+Several caveats:
+
+- The format will NOT write an ``Index``, or ``MultiIndex`` for the ``DataFrame`` and will raise an
+  error if a non-default one is provided. You can simply ``.reset_index()`` in order to store the index.
+- Duplicate column names and non-string column names are not supported.
+- Unsupported types include ``Period`` and actual python object types. These will raise a helpful error message
+  on an attempt at serialization.
+
+See the documentation for `pyarrow <http://arrow.apache.org/docs/python/>`__ and `fastparquet <https://fastparquet.readthedocs.io/en/latest/>`__.
+
+.. note::
+
+   These engines are very similar and should read/write nearly identical parquet format files.
+   The libraries differ in their underlying dependencies (``fastparquet`` uses ``numba``, while ``pyarrow`` uses a C library).
+   TODO: differing options to write non-standard columns & null treatment.
+
+.. ipython:: python
+
+   df = pd.DataFrame({'a': list('abc'),
+                      'b': list(range(1, 4)),
+                      'c': np.arange(3, 6).astype('u1'),
+                      'd': np.arange(4.0, 7.0, dtype='float64'),
+                      'e': [True, False, True],
+                      'f': pd.Categorical(list('abc')),
+                      'g': pd.date_range('20130101', periods=3),
+                      'h': pd.date_range('20130101', periods=3, tz='US/Eastern'),
+                      'i': pd.date_range('20130101', periods=3, freq='ns')})
+
+   df
+   df.dtypes
+
+Write to a parquet file.
+
+.. ipython:: python
+
+   df.to_parquet('example_pa.parquet', engine='pyarrow')
+   df.to_parquet('example_fp.parquet', engine='fastparquet')
+
+Read from a parquet file.
+
+.. ipython:: python
+
+   result = pd.read_parquet('example_pa.parquet')
+   result = pd.read_parquet('example_fp.parquet')
+
+   # we preserve dtypes
+   result.dtypes
+
+.. ipython:: python
+   :suppress:
+
+   import os
+   os.remove('example_pa.parquet')
+   os.remove('example_fp.parquet')
+
 .. _io.sql:

 SQL Queries
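As a quick illustration of the index caveat documented above (not part of this commit; assumes pandas 0.21.0+ with pyarrow installed, and the file name is made up), a non-default index is rejected until it is moved into a column:

    import pandas as pd

    df = pd.DataFrame({'a': [1, 2, 3]}, index=['x', 'y', 'z'])

    try:
        # the string index is not a default RangeIndex, so this raises
        df.to_parquet('indexed.parquet', engine='pyarrow', compression=None)
    except ValueError as err:
        print(err)

    # .reset_index() turns the index into an ordinary, string-named column,
    # which the format can store
    df.reset_index().to_parquet('indexed.parquet', engine='pyarrow',
                                compression=None)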

‎doc/source/options.rst

Lines changed: 2 additions & 0 deletions
@@ -414,6 +414,8 @@ io.hdf.default_format None default format writing format,
                                     'table'
 io.hdf.dropna_table     True    drop ALL nan rows when appending
                                 to a table
+io.parquet.engine       pyarrow The engine to use as a default for
+                                parquet reading and writing.
 mode.chained_assignment warn    Raise an exception, warn, or no
                                 action if trying to use chained
                                 assignment, The default is warn
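A short usage sketch for this new option (not part of the commit; assumes both engines are installed and the file names are illustrative):

    import pandas as pd

    df = pd.DataFrame({'a': [1, 2, 3]})

    # make fastparquet the session-wide default engine
    pd.set_option('io.parquet.engine', 'fastparquet')
    df.to_parquet('data_fp.parquet', compression=None)

    # or switch engines only within a block
    with pd.option_context('io.parquet.engine', 'pyarrow'):
        df.to_parquet('data_pa.parquet', compression=None)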

‎doc/source/whatsnew/v0.21.0.txt

Lines changed: 1 addition & 0 deletions
@@ -78,6 +78,7 @@ Other Enhancements
 - :func:`DataFrame.select_dtypes` now accepts scalar values for include/exclude as well as list-like. (:issue:`16855`)
 - :func:`date_range` now accepts 'YS' in addition to 'AS' as an alias for start of year (:issue:`9313`)
 - :func:`date_range` now accepts 'Y' in addition to 'A' as an alias for end of year (:issue:`9313`)
+- Integration with Apache Parquet, including a new top-level ``pd.read_parquet()`` function and ``DataFrame.to_parquet()`` method, see :ref:`here <io.parquet>`.

 .. _whatsnew_0210.api_breaking:
‎pandas/core/config_init.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -462,3 +462,14 @@ def _register_xlsx(engine, other):
462462
except ImportError:
463463
# fallback
464464
_register_xlsx('openpyxl', 'xlsxwriter')
465+
466+
# Set up the io.parquet specific configuration.
467+
parquet_engine_doc = """
468+
: string
469+
The default parquet reader/writer engine. Available options:
470+
'pyarrow', 'fastparquet', the default is 'pyarrow'
471+
"""
472+
473+
with cf.config_prefix('io.parquet'):
474+
cf.register_option('engine', 'pyarrow', parquet_engine_doc,
475+
validator=is_one_of_factory(['pyarrow', 'fastparquet']))
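As a rough sketch of the effect of the ``is_one_of_factory`` validator registered above (not part of the commit), an unknown engine name is rejected when the option is set, not when a file is written:

    import pandas as pd

    print(pd.get_option('io.parquet.engine'))  # 'pyarrow' by default

    try:
        pd.set_option('io.parquet.engine', 'not-an-engine')
    except ValueError as err:
        # only 'pyarrow' and 'fastparquet' pass the validator
        print(err)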

‎pandas/core/frame.py

Lines changed: 23 additions & 0 deletions
@@ -1598,6 +1598,29 @@ def to_feather(self, fname):
         from pandas.io.feather_format import to_feather
         to_feather(self, fname)

+    def to_parquet(self, fname, engine=None, compression='snappy',
+                   **kwargs):
+        """
+        Write out the binary parquet for DataFrames.
+
+        .. versionadded:: 0.21.0
+
+        Parameters
+        ----------
+        fname : str
+            string file path
+        engine : str, optional
+            The parquet engine, one of {'pyarrow', 'fastparquet'};
+            if None, will use the option: `io.parquet.engine`
+        compression : str, optional, default 'snappy'
+            compression method, includes {'gzip', 'snappy', 'brotli'}
+        kwargs are passed to the engine
+        """
+        from pandas.io.parquet import to_parquet
+        to_parquet(self, fname, engine,
+                   compression=compression, **kwargs)
+
     @Substitution(header='Write out column names. If a list of string is given, \
         it is assumed to be aliases for the column names')
     @Appender(fmt.docstring_to_string, indents=1)
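A hedged usage sketch for the new method (not part of the commit; assumes pyarrow >= 0.4.1 is installed, and uses 'gzip' so that neither python-snappy nor brotlipy is required):

    import pandas as pd

    df = pd.DataFrame({'ints': [1, 2, 3], 'strs': ['x', 'y', 'z']})

    # write with an explicit engine and a built-in compression codec
    df.to_parquet('frame.parquet', engine='pyarrow', compression='gzip')

    # read it back; dtypes should round-trip
    roundtrip = pd.read_parquet('frame.parquet', engine='pyarrow')
    print(roundtrip.dtypes)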

‎pandas/io/api.py

Lines changed: 1 addition & 0 deletions
@@ -13,6 +13,7 @@
 from pandas.io.sql import read_sql, read_sql_table, read_sql_query
 from pandas.io.sas import read_sas
 from pandas.io.feather_format import read_feather
+from pandas.io.parquet import read_parquet
 from pandas.io.stata import read_stata
 from pandas.io.pickle import read_pickle, to_pickle
 from pandas.io.packers import read_msgpack, to_msgpack

‎pandas/io/parquet.py

Lines changed: 179 additions & 0 deletions
(new file)

""" parquet compat """

from warnings import catch_warnings
from distutils.version import LooseVersion
from pandas import DataFrame, RangeIndex, Int64Index, get_option
from pandas.compat import range
from pandas.io.common import get_filepath_or_buffer


def get_engine(engine):
    """ return our implementation """

    if engine is None:
        engine = get_option('io.parquet.engine')

    if engine not in ['pyarrow', 'fastparquet']:
        raise ValueError("engine must be one of 'pyarrow', 'fastparquet'")

    if engine == 'pyarrow':
        return PyArrowImpl()
    elif engine == 'fastparquet':
        return FastParquetImpl()


class PyArrowImpl(object):

    def __init__(self):
        # since pandas is a dependency of pyarrow
        # we need to import on first use

        try:
            import pyarrow
            import pyarrow.parquet
        except ImportError:
            raise ImportError("pyarrow is required for parquet support\n\n"
                              "you can install via conda\n"
                              "conda install pyarrow -c conda-forge\n"
                              "\nor via pip\n"
                              "pip install pyarrow\n")

        if LooseVersion(pyarrow.__version__) < '0.4.1':
            raise ImportError("pyarrow >= 0.4.1 is required for parquet "
                              "support\n\n"
                              "you can install via conda\n"
                              "conda install pyarrow -c conda-forge\n"
                              "\nor via pip\n"
                              "pip install pyarrow\n")

        self.api = pyarrow

    def write(self, df, path, compression='snappy', **kwargs):
        path, _, _ = get_filepath_or_buffer(path)
        table = self.api.Table.from_pandas(df, timestamps_to_ms=True)
        self.api.parquet.write_table(
            table, path, compression=compression, **kwargs)

    def read(self, path):
        path, _, _ = get_filepath_or_buffer(path)
        return self.api.parquet.read_table(path).to_pandas()


class FastParquetImpl(object):

    def __init__(self):
        # since pandas is a dependency of fastparquet
        # we need to import on first use

        try:
            import fastparquet
        except ImportError:
            raise ImportError("fastparquet is required for parquet support\n\n"
                              "you can install via conda\n"
                              "conda install fastparquet -c conda-forge\n"
                              "\nor via pip\n"
                              "pip install fastparquet")

        if LooseVersion(fastparquet.__version__) < '0.1.0':
            raise ImportError("fastparquet >= 0.1.0 is required for parquet "
                              "support\n\n"
                              "you can install via conda\n"
                              "conda install fastparquet -c conda-forge\n"
                              "\nor via pip\n"
                              "pip install fastparquet")

        self.api = fastparquet

    def write(self, df, path, compression='snappy', **kwargs):
        # thriftpy/protocol/compact.py:339:
        # DeprecationWarning: tostring() is deprecated.
        # Use tobytes() instead.
        path, _, _ = get_filepath_or_buffer(path)
        with catch_warnings(record=True):
            self.api.write(path, df,
                           compression=compression, **kwargs)

    def read(self, path):
        path, _, _ = get_filepath_or_buffer(path)
        return self.api.ParquetFile(path).to_pandas()


def to_parquet(df, path, engine=None, compression='snappy', **kwargs):
    """
    Write a DataFrame to the parquet format.

    Parameters
    ----------
    df : DataFrame
    path : string
        File path
    engine : str, optional
        The parquet engine, one of {'pyarrow', 'fastparquet'};
        if None, will use the option: `io.parquet.engine`
    compression : str, optional, default 'snappy'
        compression method, includes {'gzip', 'snappy', 'brotli'}
    kwargs are passed to the engine
    """

    impl = get_engine(engine)

    if not isinstance(df, DataFrame):
        raise ValueError("to_parquet only supports IO with DataFrames")

    valid_types = {'string', 'unicode'}

    # validate index
    # --------------

    # validate that we have only a default index
    # raise on anything else as we don't serialize the index

    if not isinstance(df.index, Int64Index):
        raise ValueError("parquet does not support serializing {} "
                         "for the index; you can .reset_index() "
                         "to make the index into column(s)".format(
                             type(df.index)))

    if not df.index.equals(RangeIndex.from_range(range(len(df)))):
        raise ValueError("parquet does not support serializing a "
                         "non-default index for the index; you "
                         "can .reset_index() to make the index "
                         "into column(s)")

    if df.index.name is not None:
        raise ValueError("parquet does not serialize index meta-data on a "
                         "default index")

    # validate columns
    # ----------------

    # must have value column names (strings only)
    if df.columns.inferred_type not in valid_types:
        raise ValueError("parquet must have string column names")

    return impl.write(df, path, compression=compression)


def read_parquet(path, engine=None, **kwargs):
    """
    Load a parquet object from the file path, returning a DataFrame.

    .. versionadded:: 0.21.0

    Parameters
    ----------
    path : string
        File path
    engine : str, optional
        The parquet engine, one of {'pyarrow', 'fastparquet'};
        if None, will use the option: `io.parquet.engine`
    kwargs are passed to the engine

    Returns
    -------
    DataFrame

    """

    impl = get_engine(engine)
    return impl.read(path)
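A small sketch of the column validation above (not part of the commit; assumes pyarrow is installed and the file names are illustrative): ``df.columns.inferred_type`` must be 'string' or 'unicode', so non-string labels fail before either engine is called:

    import pandas as pd
    from pandas.io.parquet import to_parquet

    df = pd.DataFrame([[1, 2, 3]], columns=[0, 1, 2])

    try:
        # integer column labels -> columns.inferred_type == 'integer'
        to_parquet(df, 'bad_columns.parquet', engine='pyarrow', compression=None)
    except ValueError as err:
        print(err)  # "parquet must have string column names"

    # renaming the labels to strings satisfies the check
    to_parquet(df.rename(columns=str), 'ok_columns.parquet',
               engine='pyarrow', compression=None)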

‎pandas/tests/api/test_api.py

Lines changed: 1 addition & 1 deletion
@@ -82,7 +82,7 @@ class TestPDApi(Base):
                  'read_gbq', 'read_hdf', 'read_html', 'read_json',
                  'read_msgpack', 'read_pickle', 'read_sas', 'read_sql',
                  'read_sql_query', 'read_sql_table', 'read_stata',
-                 'read_table', 'read_feather']
+                 'read_table', 'read_feather', 'read_parquet']

     # top-level to_* funcs
     funcs_to = ['to_datetime', 'to_msgpack',

‎pandas/tests/io/test_parquet.py

Lines changed: 326 additions & 0 deletions
(new file)

""" test parquet compat """

import pytest
import datetime
from warnings import catch_warnings

import numpy as np
import pandas as pd
from pandas.compat import PY3, is_platform_windows
from pandas.io.parquet import to_parquet, read_parquet
from pandas.util import testing as tm

try:
    import pyarrow  # noqa
    _HAVE_PYARROW = True
except ImportError:
    _HAVE_PYARROW = False

try:
    import fastparquet  # noqa
    _HAVE_FASTPARQUET = True
except ImportError:
    _HAVE_FASTPARQUET = False


# setup engines & skips
@pytest.fixture(params=[
    pytest.mark.skipif(not _HAVE_FASTPARQUET,
                       reason='fastparquet is not installed')('fastparquet'),
    pytest.mark.skipif(not _HAVE_PYARROW,
                       reason='pyarrow is not installed')('pyarrow')])
def engine(request):
    return request.param


@pytest.fixture
def pa():
    if not _HAVE_PYARROW:
        pytest.skip("pyarrow is not installed")
    if is_platform_windows():
        pytest.skip("pyarrow-parquet not building on windows")
    return 'pyarrow'


@pytest.fixture
def fp():
    if not _HAVE_FASTPARQUET:
        pytest.skip("fastparquet is not installed")
    return 'fastparquet'


@pytest.fixture
def df_compat():
    return pd.DataFrame({'A': [1, 2, 3], 'B': 'foo'})


def test_invalid_engine(df_compat):

    with pytest.raises(ValueError):
        df_compat.to_parquet('foo', 'bar')


def test_options_py(df_compat, pa):
    # use the set option

    df = df_compat
    with tm.ensure_clean() as path:

        with pd.option_context('io.parquet.engine', 'pyarrow'):
            df.to_parquet(path)

            result = read_parquet(path, compression=None)
            tm.assert_frame_equal(result, df)


def test_options_fp(df_compat, fp):
    # use the set option

    df = df_compat
    with tm.ensure_clean() as path:

        with pd.option_context('io.parquet.engine', 'fastparquet'):
            df.to_parquet(path, compression=None)

            result = read_parquet(path, compression=None)
            tm.assert_frame_equal(result, df)


@pytest.mark.xfail(reason="fp does not ignore pa index __index_level_0__")
def test_cross_engine_pa_fp(df_compat, pa, fp):
    # cross-compat with differing reading/writing engines

    df = df_compat
    with tm.ensure_clean() as path:
        df.to_parquet(path, engine=pa, compression=None)

        result = read_parquet(path, engine=fp, compression=None)
        tm.assert_frame_equal(result, df)


def test_cross_engine_fp_pa(df_compat, pa, fp):
    # cross-compat with differing reading/writing engines

    df = df_compat
    with tm.ensure_clean() as path:
        df.to_parquet(path, engine=fp, compression=None)

        result = read_parquet(path, engine=pa, compression=None)
        tm.assert_frame_equal(result, df)


class Base(object):

    def check_error_on_write(self, df, engine, exc):
        # check that we are raising the exception
        # on writing

        with pytest.raises(exc):
            with tm.ensure_clean() as path:
                to_parquet(df, path, engine, compression=None)

    def check_round_trip(self, df, engine, expected=None, **kwargs):

        with tm.ensure_clean() as path:
            df.to_parquet(path, engine, **kwargs)
            result = read_parquet(path, engine)

            if expected is None:
                expected = df
            tm.assert_frame_equal(result, expected)

            # repeat
            to_parquet(df, path, engine, **kwargs)
            result = pd.read_parquet(path, engine)

            if expected is None:
                expected = df
            tm.assert_frame_equal(result, expected)


class TestBasic(Base):

    def test_error(self, engine):

        for obj in [pd.Series([1, 2, 3]), 1, 'foo', pd.Timestamp('20130101'),
                    np.array([1, 2, 3])]:
            self.check_error_on_write(obj, engine, ValueError)

    def test_columns_dtypes(self, engine):

        df = pd.DataFrame({'string': list('abc'),
                           'int': list(range(1, 4))})

        # unicode
        df.columns = [u'foo', u'bar']
        self.check_round_trip(df, engine, compression=None)

    def test_columns_dtypes_invalid(self, engine):

        df = pd.DataFrame({'string': list('abc'),
                           'int': list(range(1, 4))})

        # numeric
        df.columns = [0, 1]
        self.check_error_on_write(df, engine, ValueError)

        if PY3:
            # bytes on PY3, on PY2 these are str
            df.columns = [b'foo', b'bar']
            self.check_error_on_write(df, engine, ValueError)

        # python object
        df.columns = [datetime.datetime(2011, 1, 1, 0, 0),
                      datetime.datetime(2011, 1, 1, 1, 1)]
        self.check_error_on_write(df, engine, ValueError)

    def test_write_with_index(self, engine):

        df = pd.DataFrame({'A': [1, 2, 3]})
        self.check_round_trip(df, engine, compression=None)

        # non-default index
        for index in [[2, 3, 4],
                      pd.date_range('20130101', periods=3),
                      list('abc'),
                      [1, 3, 4],
                      pd.MultiIndex.from_tuples([('a', 1), ('a', 2),
                                                 ('b', 1)]),
                      ]:

            df.index = index
            self.check_error_on_write(df, engine, ValueError)

        # index with meta-data
        df.index = [0, 1, 2]
        df.index.name = 'foo'
        self.check_error_on_write(df, engine, ValueError)

        # column multi-index
        df.index = [0, 1, 2]
        df.columns = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)])
        self.check_error_on_write(df, engine, ValueError)

    @pytest.mark.parametrize('compression', [None, 'gzip', 'snappy', 'brotli'])
    def test_compression(self, engine, compression):

        if compression == 'snappy':
            pytest.importorskip('snappy')

        elif compression == 'brotli':
            pytest.importorskip('brotli')

        df = pd.DataFrame({'A': [1, 2, 3]})
        self.check_round_trip(df, engine, compression=compression)


class TestParquetPyArrow(Base):

    def test_basic(self, pa):

        df = pd.DataFrame({'string': list('abc'),
                           'string_with_nan': ['a', np.nan, 'c'],
                           'string_with_none': ['a', None, 'c'],
                           'bytes': [b'foo', b'bar', b'baz'],
                           'unicode': [u'foo', u'bar', u'baz'],
                           'int': list(range(1, 4)),
                           'uint': np.arange(3, 6).astype('u1'),
                           'float': np.arange(4.0, 7.0, dtype='float64'),
                           'float_with_nan': [2., np.nan, 3.],
                           'bool': [True, False, True],
                           'bool_with_none': [True, None, True],
                           'datetime_ns': pd.date_range('20130101', periods=3),
                           'datetime_with_nat': [pd.Timestamp('20130101'),
                                                 pd.NaT,
                                                 pd.Timestamp('20130103')]
                           })

        self.check_round_trip(df, pa)

    def test_duplicate_columns(self, pa):

        # not currently able to handle duplicate columns
        df = pd.DataFrame(np.arange(12).reshape(4, 3),
                          columns=list('aaa')).copy()
        self.check_error_on_write(df, pa, ValueError)

    def test_unsupported(self, pa):

        # period
        df = pd.DataFrame({'a': pd.period_range('2013', freq='M', periods=3)})
        self.check_error_on_write(df, pa, ValueError)

        # categorical
        df = pd.DataFrame({'a': pd.Categorical(list('abc'))})
        self.check_error_on_write(df, pa, NotImplementedError)

        # timedelta
        df = pd.DataFrame({'a': pd.timedelta_range('1 day',
                                                   periods=3)})
        self.check_error_on_write(df, pa, NotImplementedError)

        # mixed python objects
        df = pd.DataFrame({'a': ['a', 1, 2.0]})
        self.check_error_on_write(df, pa, ValueError)


class TestParquetFastParquet(Base):

    def test_basic(self, fp):

        df = pd.DataFrame(
            {'string': list('abc'),
             'string_with_nan': ['a', np.nan, 'c'],
             'string_with_none': ['a', None, 'c'],
             'bytes': [b'foo', b'bar', b'baz'],
             'unicode': [u'foo', u'bar', u'baz'],
             'int': list(range(1, 4)),
             'uint': np.arange(3, 6).astype('u1'),
             'float': np.arange(4.0, 7.0, dtype='float64'),
             'float_with_nan': [2., np.nan, 3.],
             'bool': [True, False, True],
             'datetime': pd.date_range('20130101', periods=3),
             'datetime_with_nat': [pd.Timestamp('20130101'),
                                   pd.NaT,
                                   pd.Timestamp('20130103')],
             'timedelta': pd.timedelta_range('1 day', periods=3),
             })

        self.check_round_trip(df, fp, compression=None)

    @pytest.mark.skip(reason="not supported")
    def test_duplicate_columns(self, fp):

        # not currently able to handle duplicate columns
        df = pd.DataFrame(np.arange(12).reshape(4, 3),
                          columns=list('aaa')).copy()
        self.check_error_on_write(df, fp, ValueError)

    def test_bool_with_none(self, fp):
        df = pd.DataFrame({'a': [True, None, False]})
        expected = pd.DataFrame({'a': [1.0, np.nan, 0.0]}, dtype='float16')
        self.check_round_trip(df, fp, expected=expected, compression=None)

    def test_unsupported(self, fp):

        # period
        df = pd.DataFrame({'a': pd.period_range('2013', freq='M', periods=3)})
        self.check_error_on_write(df, fp, ValueError)

        # mixed
        df = pd.DataFrame({'a': ['a', 1, 2.0]})
        self.check_error_on_write(df, fp, ValueError)

    def test_categorical(self, fp):
        df = pd.DataFrame({'a': pd.Categorical(list('abc'))})
        self.check_round_trip(df, fp, compression=None)

    def test_datetime_tz(self, fp):
        # doesn't preserve tz
        df = pd.DataFrame({'a': pd.date_range('20130101', periods=3,
                                              tz='US/Eastern')})

        # warns on the coercion
        with catch_warnings(record=True):
            self.check_round_trip(df, fp, df.astype('datetime64[ns]'),
                                  compression=None)
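One note on the ``engine`` fixture at the top of this file: applying a mark by calling it on a parameter value (``pytest.mark.skipif(...)('fastparquet')``) was the idiom of the pytest versions current in 2017 and is deprecated in later releases; on newer pytest the equivalent parametrization would look roughly like this (sketch only, reusing the ``_HAVE_*`` flags defined above):

    import pytest

    @pytest.fixture(params=[
        pytest.param('fastparquet',
                     marks=pytest.mark.skipif(not _HAVE_FASTPARQUET,
                                              reason='fastparquet is not installed')),
        pytest.param('pyarrow',
                     marks=pytest.mark.skipif(not _HAVE_PYARROW,
                                              reason='pyarrow is not installed')),
    ])
    def engine(request):
        return request.param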

‎pandas/util/_print_versions.py

Lines changed: 1 addition & 0 deletions
@@ -94,6 +94,7 @@ def show_versions(as_json=False):
         ("psycopg2", lambda mod: mod.__version__),
         ("jinja2", lambda mod: mod.__version__),
         ("s3fs", lambda mod: mod.__version__),
+        ("fastparquet", lambda mod: mod.__version__),
         ("pandas_gbq", lambda mod: mod.__version__),
         ("pandas_datareader", lambda mod: mod.__version__),
     ]
