
ENH: add and register Arrow extension types for Period and Interval #28371


Merged
24 commits
e3ab110
add PeriodType arrow extension type
jorisvandenbossche Sep 10, 2019
6c1300f
add IntervalType arrow extension type
jorisvandenbossche Sep 10, 2019
5eb8ad6
rename + make hashable
jorisvandenbossche Sep 10, 2019
47c4755
Merge remote-tracking branch 'upstream/master' into arrow-extension-t…
jorisvandenbossche Oct 29, 2019
e7e0674
Merge remote-tracking branch 'upstream/master' into arrow-extension-t…
jorisvandenbossche Nov 5, 2019
85bf36c
better validation of types + tests
jorisvandenbossche Nov 5, 2019
f325ff1
add tests for missing values with IntervalArray
jorisvandenbossche Nov 5, 2019
82589dd
Add arrow -> pandas conversion + tests
jorisvandenbossche Nov 5, 2019
64bf38b
Merge remote-tracking branch 'upstream/master' into arrow-extension-t…
jorisvandenbossche Nov 8, 2019
70e7023
fix interval subtype and missing value handling
jorisvandenbossche Nov 8, 2019
b09f54d
Merge remote-tracking branch 'upstream/master' into arrow-extension-t…
jorisvandenbossche Nov 12, 2019
913f310
Merge remote-tracking branch 'upstream/master' into arrow-extension-t…
jorisvandenbossche Nov 12, 2019
76a6f46
Merge remote-tracking branch 'upstream/master' into arrow-extension-t…
jorisvandenbossche Nov 21, 2019
6587bd2
use skip_if_no decorator
jorisvandenbossche Nov 21, 2019
5303bae
add parquet tests
jorisvandenbossche Nov 21, 2019
a97808c
clean-up type conversion
jorisvandenbossche Nov 21, 2019
206c609
Merge remote-tracking branch 'upstream/master' into arrow-extension-t…
jorisvandenbossche Dec 9, 2019
e9a032d
period test only for pyarrow 0.15dev (in 0.15 .values was used which …
jorisvandenbossche Dec 10, 2019
16523af
Merge remote-tracking branch 'upstream/master' into arrow-extension-t…
jorisvandenbossche Jan 6, 2020
1b6f21e
move common things to _arrow_utils
jorisvandenbossche Jan 6, 2020
d39b8a3
Merge remote-tracking branch 'upstream/master' into arrow-extension-t…
jorisvandenbossche Jan 8, 2020
4156718
use commong function in IntDtype from_arrow
jorisvandenbossche Jan 8, 2020
92a1ede
lazy import for now
jorisvandenbossche Jan 8, 2020
e303749
update whatsnew for pyarrow next version
jorisvandenbossche Jan 8, 2020
4 changes: 2 additions & 2 deletions doc/source/user_guide/io.rst
@@ -4714,8 +4714,8 @@ Several caveats.
 * Index level names, if specified, must be strings.
 * In the ``pyarrow`` engine, categorical dtypes for non-string types can be serialized to parquet, but will de-serialize as their primitive dtype.
 * The ``pyarrow`` engine preserves the ``ordered`` flag of categorical dtypes with string types. ``fastparquet`` does not preserve the ``ordered`` flag.
-* Non supported types include ``Period`` and actual Python object types. These will raise a helpful error message
-  on an attempt at serialization.
+* Non supported types include ``Interval`` and actual Python object types. These will raise a helpful error message
+  on an attempt at serialization. ``Period`` type is supported with pyarrow >= 1.0.0.
 * The ``pyarrow`` engine preserves extension data types such as the nullable integer and string data
   type (requiring pyarrow >= 1.0.0, and requiring the extension type to implement the needed protocols,
   see the :ref:`extension types documentation <extending.extension.arrow>`).
6 changes: 3 additions & 3 deletions doc/source/whatsnew/v1.0.0.rst
@@ -200,10 +200,10 @@ Other enhancements
 - Added ``encoding`` argument to :meth:`DataFrame.to_string` for non-ascii text (:issue:`28766`)
 - Added ``encoding`` argument to :func:`DataFrame.to_html` for non-ascii text (:issue:`28663`)
 - :meth:`Styler.background_gradient` now accepts ``vmin`` and ``vmax`` arguments (:issue:`12145`)
-- :meth:`Styler.format` added the ``na_rep`` parameter to help format the missing values (:issue:`21527`, :issue:`28358`)
-- Roundtripping DataFrames with nullable integer or string data types to parquet
+- Roundtripping DataFrames with nullable integer, string and period data types to parquet
   (:meth:`~DataFrame.to_parquet` / :func:`read_parquet`) using the `'pyarrow'` engine
-  now preserve those data types with pyarrow >= 1.0.0 (:issue:`20612`).
+  now preserve those data types with pyarrow >= 1.0.0 (:issue:`20612`, :issue:`28371`).
+- :meth:`Styler.format` added the ``na_rep`` parameter to help format the missing values (:issue:`21527`, :issue:`28358`)

Build Changes
^^^^^^^^^^^^^
114 changes: 114 additions & 0 deletions pandas/core/arrays/interval.py
@@ -1,3 +1,5 @@
from distutils.version import LooseVersion
import json
from operator import le, lt
import textwrap

@@ -39,6 +41,14 @@
import pandas.core.common as com
from pandas.core.indexes.base import ensure_index

try:
    import pyarrow

    _PYARROW_INSTALLED = True
except ImportError:
    _PYARROW_INSTALLED = False

Contributor:
can you make this into a function and put it in a common location

jorisvandenbossche (Member, Author), Jan 6, 2020:
I moved this for now into an _arrow_utils.py file in the arrays directory (open to other names); we can then put some common functions in that file as well.


_VALID_CLOSED = {"left", "right", "both", "neither"}
_interval_shared_docs = {}

@@ -1026,6 +1036,58 @@ def __array__(self, dtype=None):
            result[i] = Interval(left[i], right[i], closed)
        return result

    def __arrow_array__(self, type=None):
        """
        Convert myself into a pyarrow Array.
        """
        import pyarrow as pa

        try:
            subtype = pa.from_numpy_dtype(self.dtype.subtype)
        except TypeError:
            raise TypeError(
                "Conversion to arrow with subtype '{}' "
                "is not supported".format(self.dtype.subtype)
            )
        interval_type = ArrowIntervalType(subtype, self.closed)
        storage_array = pa.StructArray.from_arrays(
            [
                pa.array(self.left, type=subtype, from_pandas=True),
                pa.array(self.right, type=subtype, from_pandas=True),
            ],
            names=["left", "right"],
        )
        mask = self.isna()
        if mask.any():
            # if there are missing values, set validity bitmap also on the array level
            null_bitmap = pa.array(~mask).buffers()[1]
            storage_array = pa.StructArray.from_buffers(
                storage_array.type,
                len(storage_array),
                [null_bitmap],
                children=[storage_array.field(0), storage_array.field(1)],
            )

        if type is not None:
            if type.equals(interval_type.storage_type):
                return storage_array
            elif isinstance(type, ArrowIntervalType):
                # ensure we have the same subtype and closed attributes
                if not type.equals(interval_type):
                    raise TypeError(
                        "Not supported to convert IntervalArray to type with "
                        "different 'subtype' ({0} vs {1}) and 'closed' ({2} vs {3}) "
                        "attributes".format(
                            self.dtype.subtype, type.subtype, self.closed, type.closed
                        )
                    )
            else:
                raise TypeError(
                    "Not supported to convert IntervalArray to '{0}' type".format(type)
                )

        return pa.ExtensionArray.from_storage(interval_type, storage_array)

    _interval_shared_docs[
        "to_tuples"
    ] = """
@@ -1217,3 +1279,55 @@ def maybe_convert_platform_interval(values):
    values = np.asarray(values)

    return maybe_convert_platform(values)


if _PYARROW_INSTALLED and LooseVersion(pyarrow.__version__) >= LooseVersion("0.15"):
Contributor:
_PYARROW_INSTALLED needs to incorporate the version check

jorisvandenbossche (Member, Author):
I moved the version check into the separate file (and made it a variable), but kept it separate from the import check, as different functionality might need a different pyarrow version.

    class ArrowIntervalType(pyarrow.ExtensionType):
        def __init__(self, subtype, closed):
            # attributes need to be set first before calling
            # super init (as that calls serialize)
            assert closed in _VALID_CLOSED
            self._closed = closed
            if not isinstance(subtype, pyarrow.DataType):
                subtype = pyarrow.type_for_alias(str(subtype))
            self._subtype = subtype

            storage_type = pyarrow.struct([("left", subtype), ("right", subtype)])
            pyarrow.ExtensionType.__init__(self, storage_type, "pandas.interval")

        @property
        def subtype(self):
            return self._subtype

        @property
        def closed(self):
            return self._closed

        def __arrow_ext_serialize__(self):
            metadata = {"subtype": str(self.subtype), "closed": self.closed}
            return json.dumps(metadata).encode()

        @classmethod
        def __arrow_ext_deserialize__(cls, storage_type, serialized):
            metadata = json.loads(serialized.decode())
            subtype = pyarrow.type_for_alias(metadata["subtype"])
            closed = metadata["closed"]
            return ArrowIntervalType(subtype, closed)

        def __eq__(self, other):
            if isinstance(other, pyarrow.BaseExtensionType):
                return (
                    type(self) == type(other)
                    and self.subtype == other.subtype
                    and self.closed == other.closed
                )
            else:
                return NotImplemented

        def __hash__(self):
            return hash((str(self), str(self.subtype), self.closed))

    # register the type with a dummy instance
    _interval_type = ArrowIntervalType(pyarrow.int64(), "left")
    pyarrow.register_extension_type(_interval_type)
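The serialize/deserialize pair above only has to roundtrip the parameters that the struct storage type does not capture. A minimal pyarrow-free sketch of that metadata roundtrip (helper names here are illustrative, not part of the PR):

```python
import json


def serialize_interval_metadata(subtype: str, closed: str) -> bytes:
    # mirrors ArrowIntervalType.__arrow_ext_serialize__
    return json.dumps({"subtype": subtype, "closed": closed}).encode()


def deserialize_interval_metadata(serialized: bytes) -> tuple:
    # mirrors ArrowIntervalType.__arrow_ext_deserialize__
    metadata = json.loads(serialized.decode())
    return metadata["subtype"], metadata["closed"]


payload = serialize_interval_metadata("int64", "left")
assert deserialize_interval_metadata(payload) == ("int64", "left")
```

JSON keeps the payload human-readable in the Arrow schema metadata, at the cost of a few bytes over a binary encoding.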
70 changes: 70 additions & 0 deletions pandas/core/arrays/period.py
@@ -1,4 +1,6 @@
from datetime import timedelta
from distutils.version import LooseVersion
import json
import operator
from typing import Any, Callable, List, Optional, Sequence, Union

@@ -49,6 +51,13 @@
from pandas.tseries import frequencies
from pandas.tseries.offsets import DateOffset, Tick, _delta_to_tick

Contributor:
same as above
try:
    import pyarrow

    _PYARROW_INSTALLED = True
except ImportError:
    _PYARROW_INSTALLED = False


def _field_accessor(name, alias, docstring=None):
    def f(self):
@@ -332,6 +341,31 @@ def __array__(self, dtype=None):
        # overriding DatetimelikeArray
        return np.array(list(self), dtype=object)

    def __arrow_array__(self, type=None):
        """
        Convert myself into a pyarrow Array.
        """
        import pyarrow as pa

        if type is not None:
            if pa.types.is_integer(type):
                return pa.array(self._data, mask=self.isna(), type=type)
            elif isinstance(type, ArrowPeriodType):
                # ensure we have the same freq
                if self.freqstr != type.freq:
                    raise TypeError(
                        "Not supported to convert PeriodArray to array with different"
                        " 'freq' ({0} vs {1})".format(self.freqstr, type.freq)
                    )
            else:
                raise TypeError(
                    "Not supported to convert PeriodArray to '{0}' type".format(type)
                )

        period_type = ArrowPeriodType(self.freqstr)
        storage_array = pa.array(self._data, mask=self.isna(), type="int64")
        return pa.ExtensionArray.from_storage(period_type, storage_array)
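The storage here is just the int64 ordinals that PeriodArray already holds (`self._data`), so only the freq has to travel as extension metadata. For example, a Period's public `ordinal` attribute is exactly that integer; with freq="M" it counts months since 1970-01:

```python
import pandas as pd

# The ordinal is the int64 value that becomes the Arrow storage;
# 2020-01 is 50 years * 12 months after the 1970-01 epoch.
p = pd.Period("2020-01", freq="M")
print(p.ordinal)  # 600
```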

    # --------------------------------------------------------------------
    # Vectorized analogues of Period properties

@@ -1074,3 +1108,39 @@ def _make_field_arrays(*fields):
    ]

    return arrays


if _PYARROW_INSTALLED and LooseVersion(pyarrow.__version__) >= LooseVersion("0.15"):

    class ArrowPeriodType(pyarrow.ExtensionType):
        def __init__(self, freq):
            # attributes need to be set first before calling
            # super init (as that calls serialize)
            self._freq = freq
            pyarrow.ExtensionType.__init__(self, pyarrow.int64(), "pandas.period")

        @property
        def freq(self):
            return self._freq

        def __arrow_ext_serialize__(self):
            metadata = {"freq": self.freq}
            return json.dumps(metadata).encode()

        @classmethod
        def __arrow_ext_deserialize__(cls, storage_type, serialized):
            metadata = json.loads(serialized.decode())
            return ArrowPeriodType(metadata["freq"])

        def __eq__(self, other):
            if isinstance(other, pyarrow.BaseExtensionType):
                return type(self) == type(other) and self.freq == other.freq
            else:
                return NotImplemented

        def __hash__(self):
            return hash((str(self), self.freq))

    # register the type with a dummy instance
    _period_type = ArrowPeriodType("D")
    pyarrow.register_extension_type(_period_type)
49 changes: 49 additions & 0 deletions pandas/core/dtypes/dtypes.py
@@ -943,6 +943,36 @@ def construct_array_type(cls):

        return PeriodArray

    def __from_arrow__(self, array):
        """Construct PeriodArray from pyarrow Array/ChunkedArray."""
        import pyarrow
        from pandas.core.arrays import PeriodArray

        if isinstance(array, pyarrow.Array):
            chunks = [array]
        else:
            chunks = array.chunks

        results = []
        for arr in chunks:
            buflist = arr.buffers()
            data = np.frombuffer(buflist[-1], dtype="int64")[
                arr.offset : arr.offset + len(arr)
            ]
            bitmask = buflist[0]
            if bitmask is not None:
                mask = pyarrow.BooleanArray.from_buffers(
                    pyarrow.bool_(), len(arr), [None, bitmask]
                )
                mask = np.asarray(mask)
            else:
                mask = np.ones(len(arr), dtype=bool)
            parr = PeriodArray(data.copy(), freq=self.freq, copy=False)
            parr[~mask] = NaT
            results.append(parr)

        return PeriodArray._concat_same_type(results)
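The validity-bitmap handling above converts Arrow's packed bits into a boolean mask (True = valid) so the null positions can be set to NaT. Conceptually, using numpy directly instead of `pyarrow.BooleanArray`:

```python
import numpy as np

# One bitmap byte with bits 0 and 2 set: values 0 and 2 are
# valid, value 1 is null. Arrow bitmaps are LSB-first.
bitmap = np.frombuffer(b"\x05", dtype=np.uint8)
mask = np.unpackbits(bitmap, bitorder="little")[:3].astype(bool)
print(mask.tolist())  # [True, False, True]
```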


@register_extension_dtype
class IntervalDtype(PandasExtensionDtype):
@@ -1114,3 +1144,22 @@ def is_dtype(cls, dtype) -> bool:
            else:
                return False
        return super().is_dtype(dtype)

    def __from_arrow__(self, array):
        """Construct IntervalArray from pyarrow Array/ChunkedArray."""
        import pyarrow
        from pandas.core.arrays import IntervalArray

        if isinstance(array, pyarrow.Array):
            chunks = [array]
        else:
            chunks = array.chunks

        results = []
        for arr in chunks:
            left = np.asarray(arr.storage.field("left"), dtype=self.subtype)
            right = np.asarray(arr.storage.field("right"), dtype=self.subtype)
            iarr = IntervalArray.from_arrays(left, right, closed=array.type.closed)
            results.append(iarr)

        return IntervalArray._concat_same_type(results)
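Per chunk, the reconstruction boils down to `IntervalArray.from_arrays` on the two struct fields. A pyarrow-free sketch of that last step, with left/right as they would come out of the storage:

```python
import numpy as np
import pandas as pd

# The struct fields decode to two plain numpy arrays of bounds.
left = np.array([0.0, 1.5])
right = np.array([1.5, 3.0])
iarr = pd.arrays.IntervalArray.from_arrays(left, right, closed="right")
print(iarr[0])
```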