TST/DOC: test pyarrow tz data + doc / enable cross compat tests for pyarrow/fastparquet (#18662)

jorisvandenbossche · web-flow · commit 371649b94f5a · 2017-12-10T15:41:14.000+01:00
* TST: add parquet test with tz datetime data for pyarrow

+ clean-up basic data types tests: make common dataframe with types
supported by both pyarrow and fastparquet

* DOC: document differences between pyarrow and fastparquet in supported data types

* TST: enable pyarrow/fastparquet cross compatibility tests on smaller subset of dataframe

* only test datetime tz for pyarrow >= 0.7

* skip pa -> fp cross-compat on windows

* Add columns= case in cross-compat tests
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -4522,6 +4522,7 @@ See the documentation for `pyarrow <http://arrow.apache.org/docs/python/>`__ and
 .. note::
 
    These engines are very similar and should read/write nearly identical parquet format files.
+   Currently ``pyarrow`` does not support timedelta data, and ``fastparquet`` does not support timezone aware datetimes (they are coerced to UTC).
    These libraries differ by having different underlying dependencies (``fastparquet`` by using ``numba``, while ``pyarrow`` uses a c-library).
 
 .. ipython:: python
@@ -4548,8 +4549,8 @@ Read from a parquet file.
 
 .. ipython:: python
 
-   result = pd.read_parquet('example_pa.parquet', engine='pyarrow')
    result = pd.read_parquet('example_fp.parquet', engine='fastparquet')
+   result = pd.read_parquet('example_pa.parquet', engine='pyarrow')
 
    result.dtypes
 
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
@@ -7,7 +7,7 @@
 
 import numpy as np
 import pandas as pd
-from pandas.compat import PY3
+from pandas.compat import PY3, is_platform_windows
 from pandas.io.parquet import (to_parquet, read_parquet, get_engine,
                                PyArrowImpl, FastParquetImpl)
 from pandas.util import testing as tm
@@ -80,16 +80,36 @@ def df_compat():
 def df_cross_compat():
     df = pd.DataFrame({'a': list('abc'),
                        'b': list(range(1, 4)),
-                       'c': np.arange(3, 6).astype('u1'),
+                       # 'c': np.arange(3, 6).astype('u1'),
                        'd': np.arange(4.0, 7.0, dtype='float64'),
                        'e': [True, False, True],
                        'f': pd.date_range('20130101', periods=3),
-                       'g': pd.date_range('20130101', periods=3,
-                                          tz='US/Eastern'),
-                       'h': pd.date_range('20130101', periods=3, freq='ns')})
+                       # 'g': pd.date_range('20130101', periods=3,
+                       #                    tz='US/Eastern'),
+                       # 'h': pd.date_range('20130101', periods=3, freq='ns')
+                       })
     return df
 
 
+@pytest.fixture
+def df_full():
+    return pd.DataFrame(
+        {'string': list('abc'),
+         'string_with_nan': ['a', np.nan, 'c'],
+         'string_with_none': ['a', None, 'c'],
+         'bytes': [b'foo', b'bar', b'baz'],
+         'unicode': [u'foo', u'bar', u'baz'],
+         'int': list(range(1, 4)),
+         'uint': np.arange(3, 6).astype('u1'),
+         'float': np.arange(4.0, 7.0, dtype='float64'),
+         'float_with_nan': [2., np.nan, 3.],
+         'bool': [True, False, True],
+         'datetime': pd.date_range('20130101', periods=3),
+         'datetime_with_nat': [pd.Timestamp('20130101'),
+                               pd.NaT,
+                               pd.Timestamp('20130103')]})
+
+
 def test_invalid_engine(df_compat):
 
     with pytest.raises(ValueError):
@@ -154,7 +174,8 @@ def test_options_get_engine(fp, pa):
         assert isinstance(get_engine('fastparquet'), FastParquetImpl)
 
 
-@pytest.mark.xfail(reason="fp does not ignore pa index __index_level_0__")
+@pytest.mark.xfail(is_platform_windows(),
+                   reason="reading pa metadata failing on Windows")
 def test_cross_engine_pa_fp(df_cross_compat, pa, fp):
     # cross-compat with differing reading/writing engines
 
@@ -165,8 +186,10 @@ def test_cross_engine_pa_fp(df_cross_compat, pa, fp):
         result = read_parquet(path, engine=fp)
         tm.assert_frame_equal(result, df)
 
+        result = read_parquet(path, engine=fp, columns=['a', 'd'])
+        tm.assert_frame_equal(result, df[['a', 'd']])
+
 
-@pytest.mark.xfail(reason="pyarrow reading fp in some cases")
 def test_cross_engine_fp_pa(df_cross_compat, pa, fp):
     # cross-compat with differing reading/writing engines
 
@@ -177,6 +200,9 @@ def test_cross_engine_fp_pa(df_cross_compat, pa, fp):
         result = read_parquet(path, engine=pa)
         tm.assert_frame_equal(result, df)
 
+        result = read_parquet(path, engine=pa, columns=['a', 'd'])
+        tm.assert_frame_equal(result, df[['a', 'd']])
+
 
 class Base(object):
 
@@ -300,27 +326,31 @@ def test_read_columns(self, engine):
 
 class TestParquetPyArrow(Base):
 
-    def test_basic(self, pa):
+    def test_basic(self, pa, df_full):
 
-        df = pd.DataFrame({'string': list('abc'),
-                           'string_with_nan': ['a', np.nan, 'c'],
-                           'string_with_none': ['a', None, 'c'],
-                           'bytes': [b'foo', b'bar', b'baz'],
-                           'unicode': [u'foo', u'bar', u'baz'],
-                           'int': list(range(1, 4)),
-                           'uint': np.arange(3, 6).astype('u1'),
-                           'float': np.arange(4.0, 7.0, dtype='float64'),
-                           'float_with_nan': [2., np.nan, 3.],
-                           'bool': [True, False, True],
-                           'bool_with_none': [True, None, True],
-                           'datetime_ns': pd.date_range('20130101', periods=3),
-                           'datetime_with_nat': [pd.Timestamp('20130101'),
-                                                 pd.NaT,
-                                                 pd.Timestamp('20130103')]
-                           })
+        df = df_full
+
+        # additional supported types for pyarrow
+        import pyarrow
+        if LooseVersion(pyarrow.__version__) >= LooseVersion('0.7.0'):
+            df['datetime_tz'] = pd.date_range('20130101', periods=3,
+                                              tz='Europe/Brussels')
+        df['bool_with_none'] = [True, None, True]
 
         self.check_round_trip(df, pa)
 
+    @pytest.mark.xfail(reason="pyarrow fails on this (ARROW-1883)")
+    def test_basic_subset_columns(self, pa, df_full):
+        # GH18628
+
+        df = df_full
+        # additional supported types for pyarrow
+        df['datetime_tz'] = pd.date_range('20130101', periods=3,
+                                          tz='Europe/Brussels')
+
+        self.check_round_trip(df, pa, expected=df[['string', 'int']],
+                              read_kwargs={'columns': ['string', 'int']})
+
     def test_duplicate_columns(self, pa):
 
         # not currently able to handle duplicate columns
@@ -363,25 +393,12 @@ def test_categorical_unsupported(self, pa_lt_070):
 
 class TestParquetFastParquet(Base):
 
-    def test_basic(self, fp):
-
-        df = pd.DataFrame(
-            {'string': list('abc'),
-             'string_with_nan': ['a', np.nan, 'c'],
-             'string_with_none': ['a', None, 'c'],
-             'bytes': [b'foo', b'bar', b'baz'],
-             'unicode': [u'foo', u'bar', u'baz'],
-             'int': list(range(1, 4)),
-             'uint': np.arange(3, 6).astype('u1'),
-             'float': np.arange(4.0, 7.0, dtype='float64'),
-             'float_with_nan': [2., np.nan, 3.],
-             'bool': [True, False, True],
-             'datetime': pd.date_range('20130101', periods=3),
-             'datetime_with_nat': [pd.Timestamp('20130101'),
-                                   pd.NaT,
-                                   pd.Timestamp('20130103')],
-             'timedelta': pd.timedelta_range('1 day', periods=3),
-             })
+    def test_basic(self, fp, df_full):
+
+        df = df_full
+
+        # additional supported types for fastparquet
+        df['timedelta'] = pd.timedelta_range('1 day', periods=3)
 
         self.check_round_trip(df, fp, write_kwargs={'compression': None})