From 253c6f427e80f1b2f1eb87250e3b41c6cca20ec7 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 10 Sep 2019 16:24:36 +0200 Subject: [PATCH 1/3] ENH: Add IntegerArray.__arrow_array__ for custom conversion to Arrow --- pandas/core/arrays/integer.py | 8 ++++++++ pandas/tests/arrays/test_integer.py | 23 +++++++++++++++++++++++ pandas/tests/io/test_parquet.py | 12 ++++++++++++ 3 files changed, 43 insertions(+) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 069d661e6af34..7b03bf35faf25 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -367,6 +367,14 @@ def __array__(self, dtype=None): """ return self._coerce_to_ndarray() + def __arrow_array__(self, type=None): + """ + Convert myself into a pyarrow Array. + """ + import pyarrow as pa + + return pa.array(self._data, mask=self._mask, type=type) + _HANDLED_TYPES = (np.ndarray, numbers.Number) def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index 31a9a0483081e..31453fec209eb 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -1,3 +1,5 @@ +from distutils.version import LooseVersion + import numpy as np import pytest @@ -19,6 +21,13 @@ from pandas.tests.extension.base import BaseOpsUtil import pandas.util.testing as tm +try: + import pyarrow + + _PYARROW_INSTALLED = True +except ImportError: + _PYARROW_INSTALLED = False + def make_data(): return list(range(8)) + [np.nan] + list(range(10, 98)) + [np.nan] + [99, 100] @@ -817,6 +826,20 @@ def test_ufunc_reduce_raises(values): np.add.reduce(a) +@pytest.mark.skipif( + not _PYARROW_INSTALLED + or _PYARROW_INSTALLED + and LooseVersion(pyarrow.__version__) < LooseVersion("0.14.1.dev"), + reason="pyarrow >= 0.15.0 required", +) +def test_arrow_array(data): + import pyarrow as pa + + arr = pa.array(data) + expected = pa.array(list(data), type=data.dtype.name.lower(), 
from_pandas=True) + assert arr.equals(expected) + + # TODO(jreback) - these need testing / are broken # shift diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index d634859e72d7b..ab0daee2d4b3c 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -478,6 +478,18 @@ def test_empty_dataframe(self, pa): df = pd.DataFrame() check_round_trip(df, pa) + @td.skip_if_no("pyarrow", min_version="0.14.1.dev") + def test_nullable_integer(self, pa): + df = pd.DataFrame({"a": pd.Series([1, 2, 3], dtype="Int64")}) + # currently de-serialized as plain int + expected = df.assign(a=df.a.astype("int64")) + check_round_trip(df, pa, expected=expected) + + df = pd.DataFrame({"a": pd.Series([1, 2, 3, None], dtype="Int64")}) + # if missing values currently de-serialized as float + expected = df.assign(a=df.a.astype("float64")) + check_round_trip(df, pa, expected=expected) + class TestParquetFastParquet(Base): @td.skip_if_no("fastparquet", min_version="0.2.1") From e8142711358720870d1dcfb5ef0682f9927d10ff Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 12 Sep 2019 20:14:02 +0200 Subject: [PATCH 2/3] simplify pyarrow version check in tests --- pandas/tests/arrays/test_integer.py | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index 31453fec209eb..55e25caafc4ee 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -1,8 +1,8 @@ -from distutils.version import LooseVersion - import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas.core.dtypes.generic import ABCIndexClass import pandas as pd @@ -21,13 +21,6 @@ from pandas.tests.extension.base import BaseOpsUtil import pandas.util.testing as tm -try: - import pyarrow - - _PYARROW_INSTALLED = True -except ImportError: - _PYARROW_INSTALLED = False - def make_data(): return list(range(8)) + 
[np.nan] + list(range(10, 98)) + [np.nan] + [99, 100] @@ -826,13 +819,9 @@ def test_ufunc_reduce_raises(values): np.add.reduce(a) -@pytest.mark.skipif( - not _PYARROW_INSTALLED - or _PYARROW_INSTALLED - and LooseVersion(pyarrow.__version__) < LooseVersion("0.14.1.dev"), - reason="pyarrow >= 0.15.0 required", -) +@td.skip_if_no("pyarrow", min_version="0.14.1.dev") def test_arrow_array(data): + # protocol added in 0.15.0 import pyarrow as pa arr = pa.array(data) From 1e66165cea41682de930b63932421848713d507f Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 12 Sep 2019 22:03:17 +0200 Subject: [PATCH 3/3] add whatsnew --- doc/source/whatsnew/v1.0.0.rst | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 2d15a7e5ccadd..1e03d9df82dd7 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -20,15 +20,19 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ -- :meth:`DataFrame.to_latex` now accepts ``caption`` and ``label`` arguments (:issue:`25436`) -- + .. _whatsnew_1000.enhancements.other: Other enhancements ^^^^^^^^^^^^^^^^^^ -- +- :meth:`DataFrame.to_latex` now accepts ``caption`` and ``label`` arguments (:issue:`25436`) +- The :ref:`integer dtype <integer_na>` with support for missing values can now be converted to + ``pyarrow`` (>= 0.15.0), which means that it is supported in writing to the Parquet file format + when using the ``pyarrow`` engine. It is currently not yet supported when converting back to + pandas (so it will become an integer or float dtype depending on the presence of missing data). + (:issue:`28368`) - .. _whatsnew_1000.api_breaking: