Skip to content

Commit 371649b

Browse files
TST/DOC: test pyarrow tz data + doc / enable cross compat tests for pyarrow/fastparquet (#18662)
* TST: add parquet test with tz datetime data for pyarrow + clean-up basic data types tests: make common dataframe with types supported by both pyarrow and fastparquet
* DOC: document differences between pyarrow and fastparquet in supported data types
* TST: enable pyarrow/fastparquet cross compatibility tests on smaller subset of dataframe
* only test datetime tz for pyarrow >= 0.7
* skip pa -> fp cross-compat on windows
* Add columns= case in cross-compat tests
1 parent 16de5f9 commit 371649b

File tree

2 files changed

+62
-44
lines changed

2 files changed

+62
-44
lines changed

doc/source/io.rst

+2-1
Original file line numberDiff line numberDiff line change
@@ -4522,6 +4522,7 @@ See the documentation for `pyarrow <http://arrow.apache.org/docs/python/>`__ and
45224522
.. note::
45234523

45244524
These engines are very similar and should read/write nearly identical parquet format files.
4525+
Currently ``pyarrow`` does not support timedelta data, and ``fastparquet`` does not support timezone-aware datetimes (they are coerced to UTC).
45254526
These libraries differ in their underlying dependencies (``fastparquet`` uses ``numba``, while ``pyarrow`` uses a C library).
45264527

45274528
.. ipython:: python
@@ -4548,8 +4549,8 @@ Read from a parquet file.
45484549

45494550
.. ipython:: python
45504551
4551-
result = pd.read_parquet('example_pa.parquet', engine='pyarrow')
45524552
result = pd.read_parquet('example_fp.parquet', engine='fastparquet')
4553+
result = pd.read_parquet('example_pa.parquet', engine='pyarrow')
45534554
45544555
result.dtypes
45554556

pandas/tests/io/test_parquet.py

+60-43
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
import numpy as np
99
import pandas as pd
10-
from pandas.compat import PY3
10+
from pandas.compat import PY3, is_platform_windows
1111
from pandas.io.parquet import (to_parquet, read_parquet, get_engine,
1212
PyArrowImpl, FastParquetImpl)
1313
from pandas.util import testing as tm
@@ -80,16 +80,36 @@ def df_compat():
8080
def df_cross_compat():
8181
df = pd.DataFrame({'a': list('abc'),
8282
'b': list(range(1, 4)),
83-
'c': np.arange(3, 6).astype('u1'),
83+
# 'c': np.arange(3, 6).astype('u1'),
8484
'd': np.arange(4.0, 7.0, dtype='float64'),
8585
'e': [True, False, True],
8686
'f': pd.date_range('20130101', periods=3),
87-
'g': pd.date_range('20130101', periods=3,
88-
tz='US/Eastern'),
89-
'h': pd.date_range('20130101', periods=3, freq='ns')})
87+
# 'g': pd.date_range('20130101', periods=3,
88+
# tz='US/Eastern'),
89+
# 'h': pd.date_range('20130101', periods=3, freq='ns')
90+
})
9091
return df
9192

9293

94+
@pytest.fixture
95+
def df_full():
96+
return pd.DataFrame(
97+
{'string': list('abc'),
98+
'string_with_nan': ['a', np.nan, 'c'],
99+
'string_with_none': ['a', None, 'c'],
100+
'bytes': [b'foo', b'bar', b'baz'],
101+
'unicode': [u'foo', u'bar', u'baz'],
102+
'int': list(range(1, 4)),
103+
'uint': np.arange(3, 6).astype('u1'),
104+
'float': np.arange(4.0, 7.0, dtype='float64'),
105+
'float_with_nan': [2., np.nan, 3.],
106+
'bool': [True, False, True],
107+
'datetime': pd.date_range('20130101', periods=3),
108+
'datetime_with_nat': [pd.Timestamp('20130101'),
109+
pd.NaT,
110+
pd.Timestamp('20130103')]})
111+
112+
93113
def test_invalid_engine(df_compat):
94114

95115
with pytest.raises(ValueError):
@@ -154,7 +174,8 @@ def test_options_get_engine(fp, pa):
154174
assert isinstance(get_engine('fastparquet'), FastParquetImpl)
155175

156176

157-
@pytest.mark.xfail(reason="fp does not ignore pa index __index_level_0__")
177+
@pytest.mark.xfail(is_platform_windows(),
178+
reason="reading pa metadata failing on Windows")
158179
def test_cross_engine_pa_fp(df_cross_compat, pa, fp):
159180
# cross-compat with differing reading/writing engines
160181

@@ -165,8 +186,10 @@ def test_cross_engine_pa_fp(df_cross_compat, pa, fp):
165186
result = read_parquet(path, engine=fp)
166187
tm.assert_frame_equal(result, df)
167188

189+
result = read_parquet(path, engine=fp, columns=['a', 'd'])
190+
tm.assert_frame_equal(result, df[['a', 'd']])
191+
168192

169-
@pytest.mark.xfail(reason="pyarrow reading fp in some cases")
170193
def test_cross_engine_fp_pa(df_cross_compat, pa, fp):
171194
# cross-compat with differing reading/writing engines
172195

@@ -177,6 +200,9 @@ def test_cross_engine_fp_pa(df_cross_compat, pa, fp):
177200
result = read_parquet(path, engine=pa)
178201
tm.assert_frame_equal(result, df)
179202

203+
result = read_parquet(path, engine=pa, columns=['a', 'd'])
204+
tm.assert_frame_equal(result, df[['a', 'd']])
205+
180206

181207
class Base(object):
182208

@@ -300,27 +326,31 @@ def test_read_columns(self, engine):
300326

301327
class TestParquetPyArrow(Base):
302328

303-
def test_basic(self, pa):
329+
def test_basic(self, pa, df_full):
304330

305-
df = pd.DataFrame({'string': list('abc'),
306-
'string_with_nan': ['a', np.nan, 'c'],
307-
'string_with_none': ['a', None, 'c'],
308-
'bytes': [b'foo', b'bar', b'baz'],
309-
'unicode': [u'foo', u'bar', u'baz'],
310-
'int': list(range(1, 4)),
311-
'uint': np.arange(3, 6).astype('u1'),
312-
'float': np.arange(4.0, 7.0, dtype='float64'),
313-
'float_with_nan': [2., np.nan, 3.],
314-
'bool': [True, False, True],
315-
'bool_with_none': [True, None, True],
316-
'datetime_ns': pd.date_range('20130101', periods=3),
317-
'datetime_with_nat': [pd.Timestamp('20130101'),
318-
pd.NaT,
319-
pd.Timestamp('20130103')]
320-
})
331+
df = df_full
332+
333+
# additional supported types for pyarrow
334+
import pyarrow
335+
if LooseVersion(pyarrow.__version__) >= LooseVersion('0.7.0'):
336+
df['datetime_tz'] = pd.date_range('20130101', periods=3,
337+
tz='Europe/Brussels')
338+
df['bool_with_none'] = [True, None, True]
321339

322340
self.check_round_trip(df, pa)
323341

342+
@pytest.mark.xfail(reason="pyarrow fails on this (ARROW-1883)")
343+
def test_basic_subset_columns(self, pa, df_full):
344+
# GH18628
345+
346+
df = df_full
347+
# additional supported types for pyarrow
348+
df['datetime_tz'] = pd.date_range('20130101', periods=3,
349+
tz='Europe/Brussels')
350+
351+
self.check_round_trip(df, pa, expected=df[['string', 'int']],
352+
read_kwargs={'columns': ['string', 'int']})
353+
324354
def test_duplicate_columns(self, pa):
325355

326356
# not currently able to handle duplicate columns
@@ -363,25 +393,12 @@ def test_categorical_unsupported(self, pa_lt_070):
363393

364394
class TestParquetFastParquet(Base):
365395

366-
def test_basic(self, fp):
367-
368-
df = pd.DataFrame(
369-
{'string': list('abc'),
370-
'string_with_nan': ['a', np.nan, 'c'],
371-
'string_with_none': ['a', None, 'c'],
372-
'bytes': [b'foo', b'bar', b'baz'],
373-
'unicode': [u'foo', u'bar', u'baz'],
374-
'int': list(range(1, 4)),
375-
'uint': np.arange(3, 6).astype('u1'),
376-
'float': np.arange(4.0, 7.0, dtype='float64'),
377-
'float_with_nan': [2., np.nan, 3.],
378-
'bool': [True, False, True],
379-
'datetime': pd.date_range('20130101', periods=3),
380-
'datetime_with_nat': [pd.Timestamp('20130101'),
381-
pd.NaT,
382-
pd.Timestamp('20130103')],
383-
'timedelta': pd.timedelta_range('1 day', periods=3),
384-
})
396+
def test_basic(self, fp, df_full):
397+
398+
df = df_full
399+
400+
# additional supported types for fastparquet
401+
df['timedelta'] = pd.timedelta_range('1 day', periods=3)
385402

386403
self.check_round_trip(df, fp, write_kwargs={'compression': None})
387404

0 commit comments

Comments
 (0)