From 5a87d7e66fa8a40140e750691a0189f64e198088 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 6 Dec 2017 15:56:43 +0100 Subject: [PATCH 1/6] TST: add parquet test with tz datetime data for pyarrow + clean-up basic data types tests: make common dataframe with types supported by both pyarrow and fastparquet --- pandas/tests/io/test_parquet.py | 80 ++++++++++++++++++--------------- 1 file changed, 44 insertions(+), 36 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index e7bcff22371b7..4520e95f563b9 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -90,6 +90,25 @@ def df_cross_compat(): return df +@pytest.fixture +def df_full(): + return pd.DataFrame( + {'string': list('abc'), + 'string_with_nan': ['a', np.nan, 'c'], + 'string_with_none': ['a', None, 'c'], + 'bytes': [b'foo', b'bar', b'baz'], + 'unicode': [u'foo', u'bar', u'baz'], + 'int': list(range(1, 4)), + 'uint': np.arange(3, 6).astype('u1'), + 'float': np.arange(4.0, 7.0, dtype='float64'), + 'float_with_nan': [2., np.nan, 3.], + 'bool': [True, False, True], + 'datetime': pd.date_range('20130101', periods=3), + 'datetime_with_nat': [pd.Timestamp('20130101'), + pd.NaT, + pd.Timestamp('20130103')]}) + + def test_invalid_engine(df_compat): with pytest.raises(ValueError): @@ -300,27 +319,29 @@ def test_read_columns(self, engine): class TestParquetPyArrow(Base): - def test_basic(self, pa): + def test_basic(self, pa, df_full): - df = pd.DataFrame({'string': list('abc'), - 'string_with_nan': ['a', np.nan, 'c'], - 'string_with_none': ['a', None, 'c'], - 'bytes': [b'foo', b'bar', b'baz'], - 'unicode': [u'foo', u'bar', u'baz'], - 'int': list(range(1, 4)), - 'uint': np.arange(3, 6).astype('u1'), - 'float': np.arange(4.0, 7.0, dtype='float64'), - 'float_with_nan': [2., np.nan, 3.], - 'bool': [True, False, True], - 'bool_with_none': [True, None, True], - 'datetime_ns': pd.date_range('20130101', periods=3), - 'datetime_with_nat': [pd.Timestamp('20130101'), - pd.NaT, - pd.Timestamp('20130103')] - }) + df = df_full + + # additional supported types for pyarrow + df['datetime_tz'] = pd.date_range('20130101', periods=3, + tz='Europe/Brussels') + df['bool_with_none'] = [True, None, True] self.check_round_trip(df, pa) + @pytest.mark.xfail(reason="pyarrow fails on this (ARROW-1883)") + def test_basic_subset_columns(self, pa, df_full): + # GH18628 + + df = df_full + # additional supported types for pyarrow + df['datetime_tz'] = pd.date_range('20130101', periods=3, + tz='Europe/Brussels') + + self.check_round_trip(df, pa, expected=df[['string', 'int']], + read_kwargs={'columns': ['string', 'int']}) + def test_duplicate_columns(self, pa): # not currently able to handle duplicate columns @@ -363,25 +384,12 @@ def test_categorical_unsupported(self, pa_lt_070): class TestParquetFastParquet(Base): - def test_basic(self, fp): - - df = pd.DataFrame( - {'string': list('abc'), - 'string_with_nan': ['a', np.nan, 'c'], - 'string_with_none': ['a', None, 'c'], - 'bytes': [b'foo', b'bar', b'baz'], - 'unicode': [u'foo', u'bar', u'baz'], - 'int': list(range(1, 4)), - 'uint': np.arange(3, 6).astype('u1'), - 'float': np.arange(4.0, 7.0, dtype='float64'), - 'float_with_nan': [2., np.nan, 3.], - 'bool': [True, False, True], - 'datetime': pd.date_range('20130101', periods=3), - 'datetime_with_nat': [pd.Timestamp('20130101'), - pd.NaT, - pd.Timestamp('20130103')], - 'timedelta': pd.timedelta_range('1 day', periods=3), - }) + def test_basic(self, fp, df_full): + + df = df_full + + # 
additional supported types for fastparquet + df['timedelta'] = pd.timedelta_range('1 day', periods=3) self.check_round_trip(df, fp, write_kwargs={'compression': None}) From 2bc35cfbf3d16f977f24c5a5da6894758e0a70b2 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 6 Dec 2017 15:59:02 +0100 Subject: [PATCH 2/6] DOC: document differences between pyarrow and fastparquet in supported data types --- doc/source/io.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index f96e33dbf9882..49e264c8562d0 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -4522,6 +4522,7 @@ See the documentation for `pyarrow `__ and .. note:: These engines are very similar and should read/write nearly identical parquet format files. + Currently ``pyarrow`` does not support timedelta data, and ``fastparquet`` does not support timezone aware datetimes (they are coerced to UTC). These libraries differ by having different underlying dependencies (``fastparquet`` by using ``numba``, while ``pyarrow`` uses a c-library). .. ipython:: python @@ -4548,8 +4549,8 @@ Read from a parquet file. .. ipython:: python - result = pd.read_parquet('example_pa.parquet', engine='pyarrow') result = pd.read_parquet('example_fp.parquet', engine='fastparquet') + result = pd.read_parquet('example_pa.parquet', engine='pyarrow') result.dtypes From c200885a10015feeffe500bcca0faf941f463ddf Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 6 Dec 2017 15:59:58 +0100 Subject: [PATCH 3/6] TST: enable pyarrow/fastparquet cross compatibility tests on smaller subset of dataframe --- pandas/tests/io/test_parquet.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 4520e95f563b9..84c3af06d6d35 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -80,13 +80,14 @@ def df_compat(): def df_cross_compat(): df = pd.DataFrame({'a': list('abc'), 'b': list(range(1, 4)), - 'c': np.arange(3, 6).astype('u1'), + # 'c': np.arange(3, 6).astype('u1'), 'd': np.arange(4.0, 7.0, dtype='float64'), 'e': [True, False, True], 'f': pd.date_range('20130101', periods=3), - 'g': pd.date_range('20130101', periods=3, - tz='US/Eastern'), - 'h': pd.date_range('20130101', periods=3, freq='ns')}) + # 'g': pd.date_range('20130101', periods=3, + # tz='US/Eastern'), + # 'h': pd.date_range('20130101', periods=3, freq='ns') + }) return df @@ -173,7 +174,6 @@ def test_options_get_engine(fp, pa): assert isinstance(get_engine('fastparquet'), FastParquetImpl) -@pytest.mark.xfail(reason="fp does not ignore pa index __index_level_0__") def test_cross_engine_pa_fp(df_cross_compat, pa, fp): # cross-compat with differing reading/writing engines @@ -185,7 +185,6 @@ def test_cross_engine_pa_fp(df_cross_compat, pa, fp): tm.assert_frame_equal(result, df) -@pytest.mark.xfail(reason="pyarrow reading fp in some cases") def test_cross_engine_fp_pa(df_cross_compat, pa, fp): # cross-compat with differing reading/writing engines From 55ce01c39fee375c7c4592c1a8eaf83e416140f9 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 6 Dec 2017 17:27:30 +0100 Subject: [PATCH 4/6] only test datetime tz for pyarrow >= 0.7 --- pandas/tests/io/test_parquet.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 84c3af06d6d35..20a6d08948e9c 100644 --- a/pandas/tests/io/test_parquet.py +++ 
b/pandas/tests/io/test_parquet.py @@ -323,8 +323,10 @@ def test_basic(self, pa, df_full): df = df_full # additional supported types for pyarrow - df['datetime_tz'] = pd.date_range('20130101', periods=3, - tz='Europe/Brussels') + import pyarrow + if LooseVersion(pyarrow.__version__) >= LooseVersion('0.7.0'): + df['datetime_tz'] = pd.date_range('20130101', periods=3, + tz='Europe/Brussels') df['bool_with_none'] = [True, None, True] self.check_round_trip(df, pa) From b05ae5d7a015539401f15e367b96baf8b83d3199 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 7 Dec 2017 14:29:56 +0100 Subject: [PATCH 5/6] skip pa -> fp cross-compat on windows --- pandas/tests/io/test_parquet.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 20a6d08948e9c..5b08de75c64bb 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -7,7 +7,7 @@ import numpy as np import pandas as pd -from pandas.compat import PY3 +from pandas.compat import PY3, is_platform_windows from pandas.io.parquet import (to_parquet, read_parquet, get_engine, PyArrowImpl, FastParquetImpl) from pandas.util import testing as tm @@ -174,6 +174,8 @@ def test_options_get_engine(fp, pa): assert isinstance(get_engine('fastparquet'), FastParquetImpl) +@pytest.mark.xfail(is_platform_windows(), + reason="reading pa metadata failing on Windows") def test_cross_engine_pa_fp(df_cross_compat, pa, fp): # cross-compat with differing reading/writing engines From 6161f656f6cf27441bec3bd6da6ed0c0f15d37fa Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 8 Dec 2017 09:46:58 +0100 Subject: [PATCH 6/6] Add columns= case in cross-compat tests --- pandas/tests/io/test_parquet.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 5b08de75c64bb..c743c5d9fecd5 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -186,6 +186,9 @@ def test_cross_engine_pa_fp(df_cross_compat, pa, fp): result = read_parquet(path, engine=fp) tm.assert_frame_equal(result, df) + result = read_parquet(path, engine=fp, columns=['a', 'd']) + tm.assert_frame_equal(result, df[['a', 'd']]) + def test_cross_engine_fp_pa(df_cross_compat, pa, fp): # cross-compat with differing reading/writing engines @@ -197,6 +200,9 @@ def test_cross_engine_fp_pa(df_cross_compat, pa, fp): result = read_parquet(path, engine=pa) tm.assert_frame_equal(result, df) + result = read_parquet(path, engine=pa, columns=['a', 'd']) + tm.assert_frame_equal(result, df[['a', 'd']]) + class Base(object):
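
For context, a minimal standalone sketch of the cross-engine round trip that the patches above test and document: write with one engine, read back with the other, optionally selecting a column subset at read time. This is illustrative only, not part of the patches; it assumes pyarrow >= 0.7.0 and fastparquet are both installed, and the file name 'example.parquet' is arbitrary.

    import numpy as np
    import pandas as pd

    # columns mirroring the trimmed df_cross_compat fixture
    df = pd.DataFrame({'a': list('abc'),
                       'b': list(range(1, 4)),
                       'd': np.arange(4.0, 7.0, dtype='float64'),
                       'e': [True, False, True],
                       'f': pd.date_range('20130101', periods=3)})

    # write with pyarrow, read back with fastparquet
    df.to_parquet('example.parquet', engine='pyarrow')
    result = pd.read_parquet('example.parquet', engine='fastparquet')

    # a subset of columns can be selected when reading, as exercised
    # by the columns= cases added in the last patch
    subset = pd.read_parquet('example.parquet', engine='fastparquet',
                             columns=['a', 'd'])

Note that, per the fifth patch, reading pyarrow-written files with fastparquet is currently expected to fail on Windows, so the sketch above assumes a non-Windows platform.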