ENH: add and register Arrow extension types for Period and Interval #28371

Merged
Changes from all commits (24 commits)
e3ab110
add PeriodType arrow extension type
jorisvandenbossche Sep 10, 2019
6c1300f
add IntervalType arrow extension type
jorisvandenbossche Sep 10, 2019
5eb8ad6
rename + make hashable
jorisvandenbossche Sep 10, 2019
47c4755
Merge remote-tracking branch 'upstream/master' into arrow-extension-t…
jorisvandenbossche Oct 29, 2019
e7e0674
Merge remote-tracking branch 'upstream/master' into arrow-extension-t…
jorisvandenbossche Nov 5, 2019
85bf36c
better validation of types + tests
jorisvandenbossche Nov 5, 2019
f325ff1
add tests for missing values with IntervalArray
jorisvandenbossche Nov 5, 2019
82589dd
Add arrow -> pandas conversion + tests
jorisvandenbossche Nov 5, 2019
64bf38b
Merge remote-tracking branch 'upstream/master' into arrow-extension-t…
jorisvandenbossche Nov 8, 2019
70e7023
fix interval subtype and missing value handling
jorisvandenbossche Nov 8, 2019
b09f54d
Merge remote-tracking branch 'upstream/master' into arrow-extension-t…
jorisvandenbossche Nov 12, 2019
913f310
Merge remote-tracking branch 'upstream/master' into arrow-extension-t…
jorisvandenbossche Nov 12, 2019
76a6f46
Merge remote-tracking branch 'upstream/master' into arrow-extension-t…
jorisvandenbossche Nov 21, 2019
6587bd2
use skip_if_no decorator
jorisvandenbossche Nov 21, 2019
5303bae
add parquet tests
jorisvandenbossche Nov 21, 2019
a97808c
clean-up type conversion
jorisvandenbossche Nov 21, 2019
206c609
Merge remote-tracking branch 'upstream/master' into arrow-extension-t…
jorisvandenbossche Dec 9, 2019
e9a032d
period test only for pyarrow 0.15dev (in 0.15 .values was used which …
jorisvandenbossche Dec 10, 2019
16523af
Merge remote-tracking branch 'upstream/master' into arrow-extension-t…
jorisvandenbossche Jan 6, 2020
1b6f21e
move common things to _arrow_utils
jorisvandenbossche Jan 6, 2020
d39b8a3
Merge remote-tracking branch 'upstream/master' into arrow-extension-t…
jorisvandenbossche Jan 8, 2020
4156718
use common function in IntDtype from_arrow
jorisvandenbossche Jan 8, 2020
92a1ede
lazy import for now
jorisvandenbossche Jan 8, 2020
e303749
update whatsnew for pyarrow next version
jorisvandenbossche Jan 8, 2020
6 changes: 3 additions & 3 deletions doc/source/user_guide/io.rst
@@ -4648,10 +4648,10 @@ Several caveats.
* Index level names, if specified, must be strings.
* In the ``pyarrow`` engine, categorical dtypes for non-string types can be serialized to parquet, but will de-serialize as their primitive dtype.
* The ``pyarrow`` engine preserves the ``ordered`` flag of categorical dtypes with string types. ``fastparquet`` does not preserve the ``ordered`` flag.
* Non supported types include ``Period`` and actual Python object types. These will raise a helpful error message
on an attempt at serialization.
* Non supported types include ``Interval`` and actual Python object types. These will raise a helpful error message
on an attempt at serialization. ``Period`` type is supported with pyarrow >= 0.16.0.
* The ``pyarrow`` engine preserves extension data types such as the nullable integer and string data
type (requiring pyarrow >= 1.0.0, and requiring the extension type to implement the needed protocols,
type (requiring pyarrow >= 0.16.0, and requiring the extension type to implement the needed protocols,
see the :ref:`extension types documentation <extending.extension.arrow>`).

You can specify an ``engine`` to direct the serialization. This can be one of ``pyarrow``, or ``fastparquet``, or ``auto``.
5 changes: 2 additions & 3 deletions doc/source/whatsnew/v1.0.0.rst
@@ -204,9 +204,9 @@ Other enhancements
- Added ``encoding`` argument to :func:`DataFrame.to_html` for non-ascii text (:issue:`28663`)
- :meth:`Styler.background_gradient` now accepts ``vmin`` and ``vmax`` arguments (:issue:`12145`)
- :meth:`Styler.format` added the ``na_rep`` parameter to help format the missing values (:issue:`21527`, :issue:`28358`)
- Roundtripping DataFrames with nullable integer or string data types to parquet
- Roundtripping DataFrames with nullable integer, string and period data types to parquet
(:meth:`~DataFrame.to_parquet` / :func:`read_parquet`) using the `'pyarrow'` engine
now preserve those data types with pyarrow >= 1.0.0 (:issue:`20612`).
now preserve those data types with pyarrow >= 0.16.0 (:issue:`20612`, :issue:`28371`).
Review comment (Contributor): should this be 0.15?

Reply (Member, author): No, the pandas -> arrow conversion protocol was already included in 0.15, but the other direction (needed for a full roundtrip) only landed after 0.15. It was also decided that the next Arrow release will be 0.16 rather than 1.0, hence the text change here.

- The ``partition_cols`` argument in :meth:`DataFrame.to_parquet` now accepts a string (:issue:`27117`)
- :func:`pandas.read_json` now parses ``NaN``, ``Infinity`` and ``-Infinity`` (:issue:`12213`)
- The ``pandas.np`` submodule is now deprecated. Import numpy directly instead (:issue:`30296`)
@@ -220,7 +220,6 @@ Other enhancements
- The ``pandas.datetime`` class is now deprecated. Import from ``datetime`` instead (:issue:`30296`)
- :meth:`Timestamp.fromisocalendar` is now compatible with python 3.8 and above (:issue:`28115`)


Build Changes
^^^^^^^^^^^^^

124 changes: 124 additions & 0 deletions pandas/core/arrays/_arrow_utils.py
@@ -0,0 +1,124 @@
from distutils.version import LooseVersion
import json

import numpy as np
import pyarrow

from pandas.core.arrays.interval import _VALID_CLOSED

_pyarrow_version_ge_015 = LooseVersion(pyarrow.__version__) >= LooseVersion("0.15")


def pyarrow_array_to_numpy_and_mask(arr, dtype):
"""
Convert a primitive pyarrow.Array to a numpy array and boolean mask based
on the buffers of the Array.

Parameters
----------
arr : pyarrow.Array
dtype : numpy.dtype

Returns
-------
(data, mask)
Tuple of two numpy arrays with the raw data (with specified dtype) and
a boolean mask (validity mask, so False means missing)
"""
buflist = arr.buffers()
data = np.frombuffer(buflist[1], dtype=dtype)[arr.offset : arr.offset + len(arr)]
bitmask = buflist[0]
if bitmask is not None:
mask = pyarrow.BooleanArray.from_buffers(
pyarrow.bool_(), len(arr), [None, bitmask]
)
mask = np.asarray(mask)
else:
mask = np.ones(len(arr), dtype=bool)
return data, mask


if _pyarrow_version_ge_015:
# the pyarrow extension types are only available for pyarrow 0.15+

class ArrowPeriodType(pyarrow.ExtensionType):
def __init__(self, freq):
# attributes need to be set first before calling
# super init (as that calls serialize)
self._freq = freq
pyarrow.ExtensionType.__init__(self, pyarrow.int64(), "pandas.period")

@property
def freq(self):
return self._freq

def __arrow_ext_serialize__(self):
metadata = {"freq": self.freq}
return json.dumps(metadata).encode()

@classmethod
def __arrow_ext_deserialize__(cls, storage_type, serialized):
metadata = json.loads(serialized.decode())
return ArrowPeriodType(metadata["freq"])

def __eq__(self, other):
if isinstance(other, pyarrow.BaseExtensionType):
return type(self) == type(other) and self.freq == other.freq
else:
return NotImplemented

def __hash__(self):
return hash((str(self), self.freq))

# register the type with a dummy instance
_period_type = ArrowPeriodType("D")
pyarrow.register_extension_type(_period_type)

class ArrowIntervalType(pyarrow.ExtensionType):
def __init__(self, subtype, closed):
# attributes need to be set first before calling
# super init (as that calls serialize)
assert closed in _VALID_CLOSED
self._closed = closed
if not isinstance(subtype, pyarrow.DataType):
subtype = pyarrow.type_for_alias(str(subtype))
self._subtype = subtype

storage_type = pyarrow.struct([("left", subtype), ("right", subtype)])
pyarrow.ExtensionType.__init__(self, storage_type, "pandas.interval")

@property
def subtype(self):
return self._subtype

@property
def closed(self):
return self._closed

def __arrow_ext_serialize__(self):
metadata = {"subtype": str(self.subtype), "closed": self.closed}
return json.dumps(metadata).encode()

@classmethod
def __arrow_ext_deserialize__(cls, storage_type, serialized):
metadata = json.loads(serialized.decode())
subtype = pyarrow.type_for_alias(metadata["subtype"])
closed = metadata["closed"]
return ArrowIntervalType(subtype, closed)

def __eq__(self, other):
if isinstance(other, pyarrow.BaseExtensionType):
return (
type(self) == type(other)
and self.subtype == other.subtype
and self.closed == other.closed
)
else:
return NotImplemented

def __hash__(self):
return hash((str(self), str(self.subtype), self.closed))

# register the type with a dummy instance
_interval_type = ArrowIntervalType(pyarrow.int64(), "left")
pyarrow.register_extension_type(_interval_type)
14 changes: 2 additions & 12 deletions pandas/core/arrays/integer.py
@@ -94,6 +94,7 @@ def construct_array_type(cls):
def __from_arrow__(self, array):
"""Construct IntegerArray from passed pyarrow Array/ChunkedArray"""
import pyarrow
from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask

if isinstance(array, pyarrow.Array):
chunks = [array]
@@ -103,18 +104,7 @@ def __from_arrow__(self, array):

results = []
for arr in chunks:
buflist = arr.buffers()
data = np.frombuffer(buflist[1], dtype=self.type)[
arr.offset : arr.offset + len(arr)
]
bitmask = buflist[0]
if bitmask is not None:
mask = pyarrow.BooleanArray.from_buffers(
pyarrow.bool_(), len(arr), [None, bitmask]
)
mask = np.asarray(mask)
else:
mask = np.ones(len(arr), dtype=bool)
data, mask = pyarrow_array_to_numpy_and_mask(arr, dtype=self.type)
int_arr = IntegerArray(data.copy(), ~mask, copy=False)
results.append(int_arr)

53 changes: 53 additions & 0 deletions pandas/core/arrays/interval.py
@@ -1081,6 +1081,59 @@ def __array__(self, dtype=None):
result[i] = Interval(left[i], right[i], closed)
return result

def __arrow_array__(self, type=None):
"""
Convert myself into a pyarrow Array.
"""
import pyarrow
from pandas.core.arrays._arrow_utils import ArrowIntervalType

try:
subtype = pyarrow.from_numpy_dtype(self.dtype.subtype)
except TypeError:
raise TypeError(
"Conversion to arrow with subtype '{}' "
"is not supported".format(self.dtype.subtype)
)
interval_type = ArrowIntervalType(subtype, self.closed)
storage_array = pyarrow.StructArray.from_arrays(
[
pyarrow.array(self.left, type=subtype, from_pandas=True),
pyarrow.array(self.right, type=subtype, from_pandas=True),
],
names=["left", "right"],
)
mask = self.isna()
if mask.any():
# if there are missing values, set validity bitmap also on the array level
null_bitmap = pyarrow.array(~mask).buffers()[1]
storage_array = pyarrow.StructArray.from_buffers(
storage_array.type,
len(storage_array),
[null_bitmap],
children=[storage_array.field(0), storage_array.field(1)],
)

if type is not None:
if type.equals(interval_type.storage_type):
return storage_array
elif isinstance(type, ArrowIntervalType):
# ensure we have the same subtype and closed attributes
if not type.equals(interval_type):
raise TypeError(
"Not supported to convert IntervalArray to type with "
"different 'subtype' ({0} vs {1}) and 'closed' ({2} vs {3}) "
"attributes".format(
self.dtype.subtype, type.subtype, self.closed, type.closed
)
)
else:
raise TypeError(
"Not supported to convert IntervalArray to '{0}' type".format(type)
)

return pyarrow.ExtensionArray.from_storage(interval_type, storage_array)

_interval_shared_docs[
"to_tuples"
] = """
26 changes: 26 additions & 0 deletions pandas/core/arrays/period.py
@@ -283,6 +283,32 @@ def __array__(self, dtype=None):
# overriding DatetimelikeArray
return np.array(list(self), dtype=object)

def __arrow_array__(self, type=None):
"""
Convert myself into a pyarrow Array.
"""
import pyarrow
from pandas.core.arrays._arrow_utils import ArrowPeriodType

if type is not None:
if pyarrow.types.is_integer(type):
return pyarrow.array(self._data, mask=self.isna(), type=type)
elif isinstance(type, ArrowPeriodType):
# ensure we have the same freq
if self.freqstr != type.freq:
raise TypeError(
"Not supported to convert PeriodArray to array with different"
" 'freq' ({0} vs {1})".format(self.freqstr, type.freq)
)
else:
raise TypeError(
"Not supported to convert PeriodArray to '{0}' type".format(type)
)

period_type = ArrowPeriodType(self.freqstr)
storage_array = pyarrow.array(self._data, mask=self.isna(), type="int64")
return pyarrow.ExtensionArray.from_storage(period_type, storage_array)

# --------------------------------------------------------------------
# Vectorized analogues of Period properties

39 changes: 39 additions & 0 deletions pandas/core/dtypes/dtypes.py
@@ -950,6 +950,26 @@ def construct_array_type(cls):

return PeriodArray

def __from_arrow__(self, array):
"""Construct PeriodArray from pyarrow Array/ChunkedArray."""
import pyarrow
from pandas.core.arrays import PeriodArray
from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask

if isinstance(array, pyarrow.Array):
chunks = [array]
else:
chunks = array.chunks

results = []
for arr in chunks:
data, mask = pyarrow_array_to_numpy_and_mask(arr, dtype="int64")
parr = PeriodArray(data.copy(), freq=self.freq, copy=False)
parr[~mask] = NaT
results.append(parr)

return PeriodArray._concat_same_type(results)


@register_extension_dtype
class IntervalDtype(PandasExtensionDtype):
@@ -1121,3 +1141,22 @@ def is_dtype(cls, dtype) -> bool:
else:
return False
return super().is_dtype(dtype)

def __from_arrow__(self, array):
"""Construct IntervalArray from pyarrow Array/ChunkedArray."""
import pyarrow
from pandas.core.arrays import IntervalArray

if isinstance(array, pyarrow.Array):
chunks = [array]
else:
chunks = array.chunks

results = []
for arr in chunks:
left = np.asarray(arr.storage.field("left"), dtype=self.subtype)
right = np.asarray(arr.storage.field("right"), dtype=self.subtype)
iarr = IntervalArray.from_arrays(left, right, closed=array.type.closed)
results.append(iarr)

return IntervalArray._concat_same_type(results)
3 changes: 3 additions & 0 deletions pandas/io/parquet.py
@@ -76,6 +76,9 @@ def __init__(self):
)
import pyarrow.parquet

# import utils to register the pyarrow extension types
import pandas.core.arrays._arrow_utils # noqa

self.api = pyarrow

def write(