Skip to content

TST/DOC: test pyarrow tz data + doc / enable cross compat tests for pyarrow/fastparquet #18662

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion doc/source/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4522,6 +4522,7 @@ See the documentation for `pyarrow <http://arrow.apache.org/docs/python/>`__ and
.. note::

These engines are very similar and should read/write nearly identical parquet format files.
Currently ``pyarrow`` does not support timedelta data, and ``fastparquet`` does not support timezone aware datetimes (they are coerced to UTC).
These libraries differ by having different underlying dependencies (``fastparquet`` by using ``numba``, while ``pyarrow`` uses a c-library).

.. ipython:: python
Expand All @@ -4548,8 +4549,8 @@ Read from a parquet file.

.. ipython:: python

result = pd.read_parquet('example_pa.parquet', engine='pyarrow')
result = pd.read_parquet('example_fp.parquet', engine='fastparquet')
result = pd.read_parquet('example_pa.parquet', engine='pyarrow')

result.dtypes

Expand Down
103 changes: 60 additions & 43 deletions pandas/tests/io/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

import numpy as np
import pandas as pd
from pandas.compat import PY3
from pandas.compat import PY3, is_platform_windows
from pandas.io.parquet import (to_parquet, read_parquet, get_engine,
PyArrowImpl, FastParquetImpl)
from pandas.util import testing as tm
Expand Down Expand Up @@ -80,16 +80,36 @@ def df_compat():
def df_cross_compat():
df = pd.DataFrame({'a': list('abc'),
'b': list(range(1, 4)),
'c': np.arange(3, 6).astype('u1'),
# 'c': np.arange(3, 6).astype('u1'),
'd': np.arange(4.0, 7.0, dtype='float64'),
'e': [True, False, True],
'f': pd.date_range('20130101', periods=3),
'g': pd.date_range('20130101', periods=3,
tz='US/Eastern'),
'h': pd.date_range('20130101', periods=3, freq='ns')})
# 'g': pd.date_range('20130101', periods=3,
# tz='US/Eastern'),
# 'h': pd.date_range('20130101', periods=3, freq='ns')
})
return df


@pytest.fixture
def df_full():
return pd.DataFrame(
{'string': list('abc'),
'string_with_nan': ['a', np.nan, 'c'],
'string_with_none': ['a', None, 'c'],
'bytes': [b'foo', b'bar', b'baz'],
'unicode': [u'foo', u'bar', u'baz'],
'int': list(range(1, 4)),
'uint': np.arange(3, 6).astype('u1'),
'float': np.arange(4.0, 7.0, dtype='float64'),
'float_with_nan': [2., np.nan, 3.],
'bool': [True, False, True],
'datetime': pd.date_range('20130101', periods=3),
'datetime_with_nat': [pd.Timestamp('20130101'),
pd.NaT,
pd.Timestamp('20130103')]})


def test_invalid_engine(df_compat):

with pytest.raises(ValueError):
Expand Down Expand Up @@ -154,7 +174,8 @@ def test_options_get_engine(fp, pa):
assert isinstance(get_engine('fastparquet'), FastParquetImpl)


@pytest.mark.xfail(reason="fp does not ignore pa index __index_level_0__")
@pytest.mark.xfail(is_platform_windows(),
reason="reading pa metadata failing on Windows")
def test_cross_engine_pa_fp(df_cross_compat, pa, fp):
# cross-compat with differing reading/writing engines

Expand All @@ -165,8 +186,10 @@ def test_cross_engine_pa_fp(df_cross_compat, pa, fp):
result = read_parquet(path, engine=fp)
tm.assert_frame_equal(result, df)

result = read_parquet(path, engine=fp, columns=['a', 'd'])
tm.assert_frame_equal(result, df[['a', 'd']])


@pytest.mark.xfail(reason="pyarrow reading fp in some cases")
def test_cross_engine_fp_pa(df_cross_compat, pa, fp):
# cross-compat with differing reading/writing engines

Expand All @@ -177,6 +200,9 @@ def test_cross_engine_fp_pa(df_cross_compat, pa, fp):
result = read_parquet(path, engine=pa)
tm.assert_frame_equal(result, df)

result = read_parquet(path, engine=pa, columns=['a', 'd'])
tm.assert_frame_equal(result, df[['a', 'd']])


class Base(object):

Expand Down Expand Up @@ -300,27 +326,31 @@ def test_read_columns(self, engine):

class TestParquetPyArrow(Base):

def test_basic(self, pa):
def test_basic(self, pa, df_full):

df = pd.DataFrame({'string': list('abc'),
'string_with_nan': ['a', np.nan, 'c'],
'string_with_none': ['a', None, 'c'],
'bytes': [b'foo', b'bar', b'baz'],
'unicode': [u'foo', u'bar', u'baz'],
'int': list(range(1, 4)),
'uint': np.arange(3, 6).astype('u1'),
'float': np.arange(4.0, 7.0, dtype='float64'),
'float_with_nan': [2., np.nan, 3.],
'bool': [True, False, True],
'bool_with_none': [True, None, True],
'datetime_ns': pd.date_range('20130101', periods=3),
'datetime_with_nat': [pd.Timestamp('20130101'),
pd.NaT,
pd.Timestamp('20130103')]
})
df = df_full

# additional supported types for pyarrow
import pyarrow
if LooseVersion(pyarrow.__version__) >= LooseVersion('0.7.0'):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can remove this after we change the dep (@dhirschfeld PR)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yep. For the rest any comments?

df['datetime_tz'] = pd.date_range('20130101', periods=3,
tz='Europe/Brussels')
df['bool_with_none'] = [True, None, True]

self.check_round_trip(df, pa)

@pytest.mark.xfail(reason="pyarrow fails on this (ARROW-1883)")
def test_basic_subset_columns(self, pa, df_full):
# GH18628

df = df_full
# additional supported types for pyarrow
df['datetime_tz'] = pd.date_range('20130101', periods=3,
tz='Europe/Brussels')

self.check_round_trip(df, pa, expected=df[['string', 'int']],
read_kwargs={'columns': ['string', 'int']})

def test_duplicate_columns(self, pa):

# not currently able to handle duplicate columns
Expand Down Expand Up @@ -363,25 +393,12 @@ def test_categorical_unsupported(self, pa_lt_070):

class TestParquetFastParquet(Base):

def test_basic(self, fp):

df = pd.DataFrame(
{'string': list('abc'),
'string_with_nan': ['a', np.nan, 'c'],
'string_with_none': ['a', None, 'c'],
'bytes': [b'foo', b'bar', b'baz'],
'unicode': [u'foo', u'bar', u'baz'],
'int': list(range(1, 4)),
'uint': np.arange(3, 6).astype('u1'),
'float': np.arange(4.0, 7.0, dtype='float64'),
'float_with_nan': [2., np.nan, 3.],
'bool': [True, False, True],
'datetime': pd.date_range('20130101', periods=3),
'datetime_with_nat': [pd.Timestamp('20130101'),
pd.NaT,
pd.Timestamp('20130103')],
'timedelta': pd.timedelta_range('1 day', periods=3),
})
def test_basic(self, fp, df_full):

df = df_full

# additional supported types for fastparquet
df['timedelta'] = pd.timedelta_range('1 day', periods=3)

self.check_round_trip(df, fp, write_kwargs={'compression': None})

Expand Down