Skip to content

Commit 6630c4e

Browse files
authored
COMPAT: pyarrow >= 0.7.0 compat (#17588)
closes #17581
1 parent 0e85ca7 commit 6630c4e

File tree

2 files changed

+38
-8
lines changed

2 files changed

+38
-8
lines changed

doc/source/io.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -4492,7 +4492,7 @@ Several caveats.
44924492
- The format will NOT write an ``Index``, or ``MultiIndex`` for the ``DataFrame`` and will raise an
44934493
error if a non-default one is provided. You can ``.reset_index()`` to store the index as a column, or ``.reset_index(drop=True)`` to ignore it.
44944494
- Duplicate column names and non-string column names are not supported.
4495-
- Categorical dtypes are currently not-supported (for ``pyarrow``).
4495+
- Categorical dtypes can be serialized to parquet, but will de-serialize as ``object`` dtype.
44964496
- Unsupported types include ``Period`` and actual Python object types. These will raise a helpful error message
44974497
on an attempt at serialization.
44984498

pandas/tests/io/test_parquet.py

+37-7
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,12 @@
22

33
import pytest
44
import datetime
5+
from distutils.version import LooseVersion
56
from warnings import catch_warnings
67

78
import numpy as np
89
import pandas as pd
9-
from pandas.compat import PY3, is_platform_windows
10+
from pandas.compat import PY3
1011
from pandas.io.parquet import (to_parquet, read_parquet, get_engine,
1112
PyArrowImpl, FastParquetImpl)
1213
from pandas.util import testing as tm
@@ -42,8 +43,24 @@ def engine(request):
4243
def pa():
4344
if not _HAVE_PYARROW:
4445
pytest.skip("pyarrow is not installed")
45-
if is_platform_windows():
46-
pytest.skip("pyarrow-parquet not building on windows")
46+
return 'pyarrow'
47+
48+
49+
@pytest.fixture
50+
def pa_lt_070():
51+
if not _HAVE_PYARROW:
52+
pytest.skip("pyarrow is not installed")
53+
if LooseVersion(pyarrow.__version__) >= '0.7.0':
54+
pytest.skip("pyarrow is >= 0.7.0")
55+
return 'pyarrow'
56+
57+
58+
@pytest.fixture
59+
def pa_ge_070():
60+
if not _HAVE_PYARROW:
61+
pytest.skip("pyarrow is not installed")
62+
if LooseVersion(pyarrow.__version__) < '0.7.0':
63+
pytest.skip("pyarrow is < 0.7.0")
4764
return 'pyarrow'
4865

4966

@@ -302,10 +319,6 @@ def test_unsupported(self, pa):
302319
df = pd.DataFrame({'a': pd.period_range('2013', freq='M', periods=3)})
303320
self.check_error_on_write(df, pa, ValueError)
304321

305-
# categorical
306-
df = pd.DataFrame({'a': pd.Categorical(list('abc'))})
307-
self.check_error_on_write(df, pa, NotImplementedError)
308-
309322
# timedelta
310323
df = pd.DataFrame({'a': pd.timedelta_range('1 day',
311324
periods=3)})
@@ -315,6 +328,23 @@ def test_unsupported(self, pa):
315328
df = pd.DataFrame({'a': ['a', 1, 2.0]})
316329
self.check_error_on_write(df, pa, ValueError)
317330

331+
def test_categorical(self, pa_ge_070):
332+
pa = pa_ge_070
333+
334+
# supported in >= 0.7.0
335+
df = pd.DataFrame({'a': pd.Categorical(list('abc'))})
336+
337+
# de-serialized as object
338+
expected = df.assign(a=df.a.astype(object))
339+
self.check_round_trip(df, pa, expected)
340+
341+
def test_categorical_unsupported(self, pa_lt_070):
342+
pa = pa_lt_070
343+
344+
# not supported in < 0.7.0 (support was added in 0.7.0)
345+
df = pd.DataFrame({'a': pd.Categorical(list('abc'))})
346+
self.check_error_on_write(df, pa, NotImplementedError)
347+
318348

319349
class TestParquetFastParquet(Base):
320350

0 commit comments

Comments
 (0)