Commit 7b400b3

ENH: add use_nullable_dtypes option in read_parquet (pandas-dev#31242)
1 parent 8eca4b7 commit 7b400b3

3 files changed: +98, -4 lines

doc/source/whatsnew/v1.2.0.rst (+4 lines)
@@ -241,6 +241,10 @@ Other enhancements
 - Calling a binary-input NumPy ufunc on multiple ``DataFrame`` objects now aligns, matching the behavior of binary operations and ufuncs on ``Series`` (:issue:`23743`).
 - Where possible :meth:`RangeIndex.difference` and :meth:`RangeIndex.symmetric_difference` will return :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`36564`)
 - :meth:`DataFrame.to_parquet` now supports :class:`MultiIndex` for columns in parquet format (:issue:`34777`)
+- :func:`read_parquet` gained a ``use_nullable_dtypes=True`` option to use
+  nullable dtypes that use ``pd.NA`` as missing value indicator where possible
+  for the resulting DataFrame (default is False, and only applicable for
+  ``engine="pyarrow"``) (:issue:`31242`)
 - Added :meth:`.Rolling.sem` and :meth:`Expanding.sem` to compute the standard error of the mean (:issue:`26476`)
 - :meth:`.Rolling.var` and :meth:`.Rolling.std` use Kahan summation and Welford's Method to avoid numerical issues (:issue:`37051`)
 - :meth:`DataFrame.corr` and :meth:`DataFrame.cov` use Welford's Method to avoid numerical issues (:issue:`37448`)
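
For context, a minimal sketch of the behaviour this entry describes, mirroring the new test further down (the file name here is hypothetical):

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# Write an int64 column containing a null directly with pyarrow, so that no
# pandas metadata is stored that could otherwise restore the original dtype.
pq.write_table(pa.table({"a": pa.array([1, 2, None], "int64")}), "example.parquet")

pd.read_parquet("example.parquet")["a"].dtype
# dtype('float64')  -- by default the null forces an upcast to float64
pd.read_parquet("example.parquet", use_nullable_dtypes=True)["a"].dtype
# Int64Dtype()      -- nullable integer dtype; the missing value is pd.NA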

pandas/io/parquet.py (+57, -4 lines)
@@ -1,5 +1,6 @@
 """ parquet compat """
 
+from distutils.version import LooseVersion
 import io
 import os
 from typing import Any, AnyStr, Dict, List, Optional, Tuple
@@ -177,10 +178,39 @@ def write(
                 handles.close()
 
     def read(
-        self, path, columns=None, storage_options: StorageOptions = None, **kwargs
+        self,
+        path,
+        columns=None,
+        use_nullable_dtypes=False,
+        storage_options: StorageOptions = None,
+        **kwargs,
     ):
         kwargs["use_pandas_metadata"] = True
 
+        to_pandas_kwargs = {}
+        if use_nullable_dtypes:
+            if LooseVersion(self.api.__version__) >= "0.16":
+                import pandas as pd
+
+                mapping = {
+                    self.api.int8(): pd.Int8Dtype(),
+                    self.api.int16(): pd.Int16Dtype(),
+                    self.api.int32(): pd.Int32Dtype(),
+                    self.api.int64(): pd.Int64Dtype(),
+                    self.api.uint8(): pd.UInt8Dtype(),
+                    self.api.uint16(): pd.UInt16Dtype(),
+                    self.api.uint32(): pd.UInt32Dtype(),
+                    self.api.uint64(): pd.UInt64Dtype(),
+                    self.api.bool_(): pd.BooleanDtype(),
+                    self.api.string(): pd.StringDtype(),
+                }
+                to_pandas_kwargs["types_mapper"] = mapping.get
+            else:
+                raise ValueError(
+                    "'use_nullable_dtypes=True' is only supported for pyarrow >= 0.16 "
+                    f"({self.api.__version__} is installed)"
+                )
+
         path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
             path,
             kwargs.pop("filesystem", None),
@@ -190,7 +220,7 @@ def read(
         try:
             return self.api.parquet.read_table(
                 path_or_handle, columns=columns, **kwargs
-            ).to_pandas()
+            ).to_pandas(**to_pandas_kwargs)
         finally:
             if handles is not None:
                 handles.close()
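
The conversion above hinges on pyarrow's ``types_mapper`` hook: ``Table.to_pandas()`` (pyarrow >= 0.16) calls it once per column type and uses the returned pandas extension dtype, falling back to the default conversion whenever it returns None, which is why passing ``mapping.get`` works. A minimal standalone sketch:

import pandas as pd
import pyarrow as pa

table = pa.table(
    {
        "a": pa.array([1, None], "int64"),
        "b": pa.array([0.5, None]),  # float64, not in the mapping
    }
)

# dict.get returns None for the unmapped float64 type, so "b" keeps the
# default float64 conversion while "a" becomes nullable Int64.
mapping = {pa.int64(): pd.Int64Dtype()}
df = table.to_pandas(types_mapper=mapping.get)
print(df.dtypes)  # a: Int64, b: float64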
@@ -258,6 +288,12 @@ def write(
     def read(
         self, path, columns=None, storage_options: StorageOptions = None, **kwargs
     ):
+        use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False)
+        if use_nullable_dtypes:
+            raise ValueError(
+                "The 'use_nullable_dtypes' argument is not supported for the "
+                "fastparquet engine"
+            )
         path = stringify_path(path)
         parquet_kwargs = {}
         handles = None
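
The fastparquet implementation only pops the keyword in order to reject it explicitly, since the option is pyarrow-only in this change. With a hypothetical file, the resulting behaviour is:

import pandas as pd

pd.read_parquet("example.parquet", engine="fastparquet", use_nullable_dtypes=True)
# ValueError: The 'use_nullable_dtypes' argument is not supported for the
# fastparquet engine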
@@ -368,7 +404,13 @@ def to_parquet(
     return None
 
 
-def read_parquet(path, engine: str = "auto", columns=None, **kwargs):
+def read_parquet(
+    path,
+    engine: str = "auto",
+    columns=None,
+    use_nullable_dtypes: bool = False,
+    **kwargs,
+):
     """
     Load a parquet object from the file path, returning a DataFrame.
 
@@ -397,6 +439,15 @@ def read_parquet(path, engine: str = "auto", columns=None, **kwargs):
         'pyarrow' is unavailable.
     columns : list, default=None
         If not None, only these columns will be read from the file.
+    use_nullable_dtypes : bool, default False
+        If True, use dtypes that use ``pd.NA`` as missing value indicator
+        for the resulting DataFrame (only applicable for ``engine="pyarrow"``).
+        As new dtypes are added that support ``pd.NA`` in the future, the
+        output with this option will change to use those dtypes.
+        Note: this is an experimental option, and behaviour (e.g. additional
+        supported dtypes) may change without notice.
+
+        .. versionadded:: 1.2.0
     **kwargs
         Any additional kwargs are passed to the engine.
@@ -405,4 +456,6 @@ def read_parquet(path, engine: str = "auto", columns=None, **kwargs):
     DataFrame
     """
     impl = get_engine(engine)
-    return impl.read(path, columns=columns, **kwargs)
+    return impl.read(
+        path, columns=columns, use_nullable_dtypes=use_nullable_dtypes, **kwargs
+    )
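
As a quick illustration of what "dtypes that use ``pd.NA`` as missing value indicator" means in the docstring above, using the nullable arrays directly:

import pandas as pd

arr = pd.array([1, 2, None], dtype="Int64")
arr[2]           # <NA>, not np.nan; the integers are not upcast to float
arr[2] is pd.NA  # True
pd.array(["a", None], dtype="string")[1] is pd.NA  # True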

pandas/tests/io/test_parquet.py (+37 lines)
@@ -828,6 +828,35 @@ def test_additional_extension_types(self, pa):
         )
         check_round_trip(df, pa)
 
+    @td.skip_if_no("pyarrow", min_version="0.16")
+    def test_use_nullable_dtypes(self, pa):
+        import pyarrow.parquet as pq
+
+        table = pyarrow.table(
+            {
+                "a": pyarrow.array([1, 2, 3, None], "int64"),
+                "b": pyarrow.array([1, 2, 3, None], "uint8"),
+                "c": pyarrow.array(["a", "b", "c", None]),
+                "d": pyarrow.array([True, False, True, None]),
+            }
+        )
+        with tm.ensure_clean() as path:
+            # write manually with pyarrow to write integers
+            pq.write_table(table, path)
+            result1 = read_parquet(path)
+            result2 = read_parquet(path, use_nullable_dtypes=True)
+
+        assert result1["a"].dtype == np.dtype("float64")
+        expected = pd.DataFrame(
+            {
+                "a": pd.array([1, 2, 3, None], dtype="Int64"),
+                "b": pd.array([1, 2, 3, None], dtype="UInt8"),
+                "c": pd.array(["a", "b", "c", None], dtype="string"),
+                "d": pd.array([True, False, True, None], dtype="boolean"),
+            }
+        )
+        tm.assert_frame_equal(result2, expected)
+
     @td.skip_if_no("pyarrow", min_version="0.14")
     def test_timestamp_nanoseconds(self, pa):
         # with version 2.0, pyarrow defaults to writing the nanoseconds, so
@@ -1001,3 +1030,11 @@ def test_timezone_aware_index(self, fp, timezone_aware_date_list):
         expected = df.copy()
         expected.index.name = "index"
         check_round_trip(df, fp, expected=expected)
+
+    def test_use_nullable_dtypes_not_supported(self, fp):
+        df = pd.DataFrame({"a": [1, 2]})
+
+        with tm.ensure_clean() as path:
+            df.to_parquet(path)
+            with pytest.raises(ValueError, match="not supported for the fastparquet"):
+                read_parquet(path, engine="fastparquet", use_nullable_dtypes=True)
