From 5a87d7e66fa8a40140e750691a0189f64e198088 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 6 Dec 2017 15:56:43 +0100 Subject: [PATCH 1/6] TST: add parquet test with tz datetime data for pyarrow + clean-up basic data types tests: make common dataframe with types supported by both pyarrow and fastparquet --- pandas/tests/io/test_parquet.py | 80 ++++++++++++++++++--------------- 1 file changed, 44 insertions(+), 36 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index e7bcff22371b7..4520e95f563b9 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -90,6 +90,25 @@ def df_cross_compat(): return df +@pytest.fixture +def df_full(): + return pd.DataFrame( + {'string': list('abc'), + 'string_with_nan': ['a', np.nan, 'c'], + 'string_with_none': ['a', None, 'c'], + 'bytes': [b'foo', b'bar', b'baz'], + 'unicode': [u'foo', u'bar', u'baz'], + 'int': list(range(1, 4)), + 'uint': np.arange(3, 6).astype('u1'), + 'float': np.arange(4.0, 7.0, dtype='float64'), + 'float_with_nan': [2., np.nan, 3.], + 'bool': [True, False, True], + 'datetime': pd.date_range('20130101', periods=3), + 'datetime_with_nat': [pd.Timestamp('20130101'), + pd.NaT, + pd.Timestamp('20130103')]}) + + def test_invalid_engine(df_compat): with pytest.raises(ValueError): @@ -300,27 +319,29 @@ def test_read_columns(self, engine): class TestParquetPyArrow(Base): - def test_basic(self, pa): + def test_basic(self, pa, df_full): - df = pd.DataFrame({'string': list('abc'), - 'string_with_nan': ['a', np.nan, 'c'], - 'string_with_none': ['a', None, 'c'], - 'bytes': [b'foo', b'bar', b'baz'], - 'unicode': [u'foo', u'bar', u'baz'], - 'int': list(range(1, 4)), - 'uint': np.arange(3, 6).astype('u1'), - 'float': np.arange(4.0, 7.0, dtype='float64'), - 'float_with_nan': [2., np.nan, 3.], - 'bool': [True, False, True], - 'bool_with_none': [True, None, True], - 'datetime_ns': pd.date_range('20130101', periods=3), - 'datetime_with_nat': [pd.Timestamp('20130101'), - pd.NaT, - pd.Timestamp('20130103')] - }) + df = df_full + + # additional supported types for pyarrow + df['datetime_tz'] = pd.date_range('20130101', periods=3, + tz='Europe/Brussels') + df['bool_with_none'] = [True, None, True] self.check_round_trip(df, pa) + @pytest.mark.xfail(reason="pyarrow fails on this (ARROW-1883)") + def test_basic_subset_columns(self, pa, df_full): + # GH18628 + + df = df_full + # additional supported types for pyarrow + df['datetime_tz'] = pd.date_range('20130101', periods=3, + tz='Europe/Brussels') + + self.check_round_trip(df, pa, expected=df[['string', 'int']], + read_kwargs={'columns': ['string', 'int']}) + def test_duplicate_columns(self, pa): # not currently able to handle duplicate columns @@ -363,25 +384,12 @@ def test_categorical_unsupported(self, pa_lt_070): class TestParquetFastParquet(Base): - def test_basic(self, fp): - - df = pd.DataFrame( - {'string': list('abc'), - 'string_with_nan': ['a', np.nan, 'c'], - 'string_with_none': ['a', None, 'c'], - 'bytes': [b'foo', b'bar', b'baz'], - 'unicode': [u'foo', u'bar', u'baz'], - 'int': list(range(1, 4)), - 'uint': np.arange(3, 6).astype('u1'), - 'float': np.arange(4.0, 7.0, dtype='float64'), - 'float_with_nan': [2., np.nan, 3.], - 'bool': [True, False, True], - 'datetime': pd.date_range('20130101', periods=3), - 'datetime_with_nat': [pd.Timestamp('20130101'), - pd.NaT, - pd.Timestamp('20130103')], - 'timedelta': pd.timedelta_range('1 day', periods=3), - }) + def test_basic(self, fp, df_full): + + df = df_full + + # 
additional supported types for fastparquet + df['timedelta'] = pd.timedelta_range('1 day', periods=3) self.check_round_trip(df, fp, write_kwargs={'compression': None}) From 2bc35cfbf3d16f977f24c5a5da6894758e0a70b2 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 6 Dec 2017 15:59:02 +0100 Subject: [PATCH 2/6] DOC: document differences between pyarrow and fastparquet in supported data types --- doc/source/io.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index f96e33dbf9882..49e264c8562d0 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -4522,6 +4522,7 @@ See the documentation for `pyarrow `__ and .. note:: These engines are very similar and should read/write nearly identical parquet format files. + Currently ``pyarrow`` does not support timedelta data, and ``fastparquet`` does not support timezone aware datetimes (they are coerced to UTC). These libraries differ by having different underlying dependencies (``fastparquet`` by using ``numba``, while ``pyarrow`` uses a c-library). .. ipython:: python @@ -4548,8 +4549,8 @@ Read from a parquet file. .. ipython:: python - result = pd.read_parquet('example_pa.parquet', engine='pyarrow') result = pd.read_parquet('example_fp.parquet', engine='fastparquet') + result = pd.read_parquet('example_pa.parquet', engine='pyarrow') result.dtypes From c200885a10015feeffe500bcca0faf941f463ddf Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 6 Dec 2017 15:59:58 +0100 Subject: [PATCH 3/6] TST: enable pyarrow/fastparquet cross compatibility tests on smaller subset of dataframe --- pandas/tests/io/test_parquet.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 4520e95f563b9..84c3af06d6d35 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -80,13 +80,14 @@ def df_compat(): def df_cross_compat(): df = pd.DataFrame({'a': list('abc'), 'b': list(range(1, 4)), - 'c': np.arange(3, 6).astype('u1'), + # 'c': np.arange(3, 6).astype('u1'), 'd': np.arange(4.0, 7.0, dtype='float64'), 'e': [True, False, True], 'f': pd.date_range('20130101', periods=3), - 'g': pd.date_range('20130101', periods=3, - tz='US/Eastern'), - 'h': pd.date_range('20130101', periods=3, freq='ns')}) + # 'g': pd.date_range('20130101', periods=3, + # tz='US/Eastern'), + # 'h': pd.date_range('20130101', periods=3, freq='ns') + }) return df @@ -173,7 +174,6 @@ def test_options_get_engine(fp, pa): assert isinstance(get_engine('fastparquet'), FastParquetImpl) -@pytest.mark.xfail(reason="fp does not ignore pa index __index_level_0__") def test_cross_engine_pa_fp(df_cross_compat, pa, fp): # cross-compat with differing reading/writing engines @@ -185,7 +185,6 @@ def test_cross_engine_pa_fp(df_cross_compat, pa, fp): tm.assert_frame_equal(result, df) -@pytest.mark.xfail(reason="pyarrow reading fp in some cases") def test_cross_engine_fp_pa(df_cross_compat, pa, fp): # cross-compat with differing reading/writing engines From 55ce01c39fee375c7c4592c1a8eaf83e416140f9 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 6 Dec 2017 17:27:30 +0100 Subject: [PATCH 4/6] only test datetime tz for pyarrow >= 0.7 --- pandas/tests/io/test_parquet.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 84c3af06d6d35..20a6d08948e9c 100644 --- a/pandas/tests/io/test_parquet.py +++ 
b/pandas/tests/io/test_parquet.py @@ -323,8 +323,10 @@ def test_basic(self, pa, df_full): df = df_full # additional supported types for pyarrow - df['datetime_tz'] = pd.date_range('20130101', periods=3, - tz='Europe/Brussels') + import pyarrow + if LooseVersion(pyarrow.__version__) >= LooseVersion('0.7.0'): + df['datetime_tz'] = pd.date_range('20130101', periods=3, + tz='Europe/Brussels') df['bool_with_none'] = [True, None, True] self.check_round_trip(df, pa) From b05ae5d7a015539401f15e367b96baf8b83d3199 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 7 Dec 2017 14:29:56 +0100 Subject: [PATCH 5/6] skip pa -> fp cross-compat on windows --- pandas/tests/io/test_parquet.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 20a6d08948e9c..5b08de75c64bb 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -7,7 +7,7 @@ import numpy as np import pandas as pd -from pandas.compat import PY3 +from pandas.compat import PY3, is_platform_windows from pandas.io.parquet import (to_parquet, read_parquet, get_engine, PyArrowImpl, FastParquetImpl) from pandas.util import testing as tm @@ -174,6 +174,8 @@ def test_options_get_engine(fp, pa): assert isinstance(get_engine('fastparquet'), FastParquetImpl) +@pytest.mark.xfail(is_platform_windows(), + reason="reading pa metadata failing on Windows") def test_cross_engine_pa_fp(df_cross_compat, pa, fp): # cross-compat with differing reading/writing engines From 6161f656f6cf27441bec3bd6da6ed0c0f15d37fa Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 8 Dec 2017 09:46:58 +0100 Subject: [PATCH 6/6] Add columns= case in cross-compat tests --- pandas/tests/io/test_parquet.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 5b08de75c64bb..c743c5d9fecd5 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -186,6 +186,9 @@ def test_cross_engine_pa_fp(df_cross_compat, pa, fp): result = read_parquet(path, engine=fp) tm.assert_frame_equal(result, df) + result = read_parquet(path, engine=fp, columns=['a', 'd']) + tm.assert_frame_equal(result, df[['a', 'd']]) + def test_cross_engine_fp_pa(df_cross_compat, pa, fp): # cross-compat with differing reading/writing engines @@ -197,6 +200,9 @@ def test_cross_engine_fp_pa(df_cross_compat, pa, fp): result = read_parquet(path, engine=pa) tm.assert_frame_equal(result, df) + result = read_parquet(path, engine=pa, columns=['a', 'd']) + tm.assert_frame_equal(result, df[['a', 'd']]) + class Base(object):
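
For context, a minimal standalone sketch of the cross-engine round trip that the patches above test and document: write with one engine, read back with the other, optionally selecting a column subset at read time. This is illustrative only, not part of the patches; it assumes pyarrow >= 0.7.0 and fastparquet are both installed, and the file name 'example.parquet' is arbitrary.

    import numpy as np
    import pandas as pd

    # columns mirroring the trimmed df_cross_compat fixture
    df = pd.DataFrame({'a': list('abc'),
                       'b': list(range(1, 4)),
                       'd': np.arange(4.0, 7.0, dtype='float64'),
                       'e': [True, False, True],
                       'f': pd.date_range('20130101', periods=3)})

    # write with pyarrow, read back with fastparquet
    df.to_parquet('example.parquet', engine='pyarrow')
    result = pd.read_parquet('example.parquet', engine='fastparquet')

    # a subset of columns can be selected when reading, as exercised
    # by the columns= cases added in the last patch
    subset = pd.read_parquet('example.parquet', engine='fastparquet',
                             columns=['a', 'd'])

Note that, per the fifth patch, reading pyarrow-written files with fastparquet is currently expected to fail on Windows, so the sketch above assumes a non-Windows platform.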