Commit 7b400b3

ENH: add use_nullable_dtypes option in read_parquet (pandas-dev#31242)
1 parent 8eca4b7 commit 7b400b3

3 files changed: +98, -4 lines

doc/source/whatsnew/v1.2.0.rst (+4 lines)
@@ -241,6 +241,10 @@ Other enhancements
 - Calling a binary-input NumPy ufunc on multiple ``DataFrame`` objects now aligns, matching the behavior of binary operations and ufuncs on ``Series`` (:issue:`23743`).
 - Where possible :meth:`RangeIndex.difference` and :meth:`RangeIndex.symmetric_difference` will return :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`36564`)
 - :meth:`DataFrame.to_parquet` now supports :class:`MultiIndex` for columns in parquet format (:issue:`34777`)
+- :func:`read_parquet` gained a ``use_nullable_dtypes=True`` option to use
+  nullable dtypes that use ``pd.NA`` as missing value indicator where possible
+  for the resulting DataFrame (default is False, and only applicable for
+  ``engine="pyarrow"``) (:issue:`31242`)
 - Added :meth:`.Rolling.sem` and :meth:`Expanding.sem` to compute the standard error of the mean (:issue:`26476`)
 - :meth:`.Rolling.var` and :meth:`.Rolling.std` use Kahan summation and Welford's Method to avoid numerical issues (:issue:`37051`)
 - :meth:`DataFrame.corr` and :meth:`DataFrame.cov` use Welford's Method to avoid numerical issues (:issue:`37448`)
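
For context, a minimal sketch of the behaviour this entry describes, mirroring the new test further down (the file name here is hypothetical):

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# Write an int64 column containing a null directly with pyarrow, so that no
# pandas metadata is stored that could otherwise restore the original dtype.
pq.write_table(pa.table({"a": pa.array([1, 2, None], "int64")}), "example.parquet")

pd.read_parquet("example.parquet")["a"].dtype
# dtype('float64')  -- by default the null forces an upcast to float64
pd.read_parquet("example.parquet", use_nullable_dtypes=True)["a"].dtype
# Int64Dtype()      -- nullable integer dtype; the missing value is pd.NA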

pandas/io/parquet.py (+57, -4 lines)
@@ -1,5 +1,6 @@
 """ parquet compat """
 
+from distutils.version import LooseVersion
 import io
 import os
 from typing import Any, AnyStr, Dict, List, Optional, Tuple
@@ -177,10 +178,39 @@ def write(
                 handles.close()
 
     def read(
-        self, path, columns=None, storage_options: StorageOptions = None, **kwargs
+        self,
+        path,
+        columns=None,
+        use_nullable_dtypes=False,
+        storage_options: StorageOptions = None,
+        **kwargs,
     ):
         kwargs["use_pandas_metadata"] = True
 
+        to_pandas_kwargs = {}
+        if use_nullable_dtypes:
+            if LooseVersion(self.api.__version__) >= "0.16":
+                import pandas as pd
+
+                mapping = {
+                    self.api.int8(): pd.Int8Dtype(),
+                    self.api.int16(): pd.Int16Dtype(),
+                    self.api.int32(): pd.Int32Dtype(),
+                    self.api.int64(): pd.Int64Dtype(),
+                    self.api.uint8(): pd.UInt8Dtype(),
+                    self.api.uint16(): pd.UInt16Dtype(),
+                    self.api.uint32(): pd.UInt32Dtype(),
+                    self.api.uint64(): pd.UInt64Dtype(),
+                    self.api.bool_(): pd.BooleanDtype(),
+                    self.api.string(): pd.StringDtype(),
+                }
+                to_pandas_kwargs["types_mapper"] = mapping.get
+            else:
+                raise ValueError(
+                    "'use_nullable_dtypes=True' is only supported for pyarrow >= 0.16 "
+                    f"({self.api.__version__} is installed)"
+                )
+
         path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
             path,
             kwargs.pop("filesystem", None),
@@ -190,7 +220,7 @@ def read(
         try:
             return self.api.parquet.read_table(
                 path_or_handle, columns=columns, **kwargs
-            ).to_pandas()
+            ).to_pandas(**to_pandas_kwargs)
         finally:
             if handles is not None:
                 handles.close()
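
The conversion above hinges on pyarrow's ``types_mapper`` hook: ``Table.to_pandas()`` (pyarrow >= 0.16) calls it once per column type and uses the returned pandas extension dtype, falling back to the default conversion whenever it returns None, which is why passing ``mapping.get`` works. A minimal standalone sketch:

import pandas as pd
import pyarrow as pa

table = pa.table(
    {
        "a": pa.array([1, None], "int64"),
        "b": pa.array([0.5, None]),  # float64, not in the mapping
    }
)

# dict.get returns None for the unmapped float64 type, so "b" keeps the
# default float64 conversion while "a" becomes nullable Int64.
mapping = {pa.int64(): pd.Int64Dtype()}
df = table.to_pandas(types_mapper=mapping.get)
print(df.dtypes)  # a: Int64, b: float64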
@@ -258,6 +288,12 @@ def write(
     def read(
         self, path, columns=None, storage_options: StorageOptions = None, **kwargs
     ):
+        use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False)
+        if use_nullable_dtypes:
+            raise ValueError(
+                "The 'use_nullable_dtypes' argument is not supported for the "
+                "fastparquet engine"
+            )
         path = stringify_path(path)
         parquet_kwargs = {}
         handles = None
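
The fastparquet implementation only pops the keyword in order to reject it explicitly, since the option is pyarrow-only in this change. With a hypothetical file, the resulting behaviour is:

import pandas as pd

pd.read_parquet("example.parquet", engine="fastparquet", use_nullable_dtypes=True)
# ValueError: The 'use_nullable_dtypes' argument is not supported for the
# fastparquet engine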
@@ -368,7 +404,13 @@ def to_parquet(
     return None
 
 
-def read_parquet(path, engine: str = "auto", columns=None, **kwargs):
+def read_parquet(
+    path,
+    engine: str = "auto",
+    columns=None,
+    use_nullable_dtypes: bool = False,
+    **kwargs,
+):
     """
     Load a parquet object from the file path, returning a DataFrame.
 
@@ -397,6 +439,15 @@ def read_parquet(path, engine: str = "auto", columns=None, **kwargs):
         'pyarrow' is unavailable.
     columns : list, default=None
         If not None, only these columns will be read from the file.
+    use_nullable_dtypes : bool, default False
+        If True, use dtypes that use ``pd.NA`` as missing value indicator
+        for the resulting DataFrame (only applicable for ``engine="pyarrow"``).
+        As new dtypes are added that support ``pd.NA`` in the future, the
+        output with this option will change to use those dtypes.
+        Note: this is an experimental option, and behaviour (e.g. additional
+        supported dtypes) may change without notice.
+
+        .. versionadded:: 1.2.0
     **kwargs
         Any additional kwargs are passed to the engine.
@@ -405,4 +456,6 @@ def read_parquet(path, engine: str = "auto", columns=None, **kwargs):
     DataFrame
     """
     impl = get_engine(engine)
-    return impl.read(path, columns=columns, **kwargs)
+    return impl.read(
+        path, columns=columns, use_nullable_dtypes=use_nullable_dtypes, **kwargs
+    )
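
As a quick illustration of what "dtypes that use ``pd.NA`` as missing value indicator" means in the docstring above, using the nullable arrays directly:

import pandas as pd

arr = pd.array([1, 2, None], dtype="Int64")
arr[2]           # <NA>, not np.nan; the integers are not upcast to float
arr[2] is pd.NA  # True
pd.array(["a", None], dtype="string")[1] is pd.NA  # True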

pandas/tests/io/test_parquet.py (+37 lines)
@@ -828,6 +828,35 @@ def test_additional_extension_types(self, pa):
         )
         check_round_trip(df, pa)
 
+    @td.skip_if_no("pyarrow", min_version="0.16")
+    def test_use_nullable_dtypes(self, pa):
+        import pyarrow.parquet as pq
+
+        table = pyarrow.table(
+            {
+                "a": pyarrow.array([1, 2, 3, None], "int64"),
+                "b": pyarrow.array([1, 2, 3, None], "uint8"),
+                "c": pyarrow.array(["a", "b", "c", None]),
+                "d": pyarrow.array([True, False, True, None]),
+            }
+        )
+        with tm.ensure_clean() as path:
+            # write manually with pyarrow to write integers
+            pq.write_table(table, path)
+            result1 = read_parquet(path)
+            result2 = read_parquet(path, use_nullable_dtypes=True)
+
+        assert result1["a"].dtype == np.dtype("float64")
+        expected = pd.DataFrame(
+            {
+                "a": pd.array([1, 2, 3, None], dtype="Int64"),
+                "b": pd.array([1, 2, 3, None], dtype="UInt8"),
+                "c": pd.array(["a", "b", "c", None], dtype="string"),
+                "d": pd.array([True, False, True, None], dtype="boolean"),
+            }
+        )
+        tm.assert_frame_equal(result2, expected)
+
     @td.skip_if_no("pyarrow", min_version="0.14")
     def test_timestamp_nanoseconds(self, pa):
         # with version 2.0, pyarrow defaults to writing the nanoseconds, so
@@ -1001,3 +1030,11 @@ def test_timezone_aware_index(self, fp, timezone_aware_date_list):
         expected = df.copy()
         expected.index.name = "index"
         check_round_trip(df, fp, expected=expected)
+
+    def test_use_nullable_dtypes_not_supported(self, fp):
+        df = pd.DataFrame({"a": [1, 2]})
+
+        with tm.ensure_clean() as path:
+            df.to_parquet(path)
+            with pytest.raises(ValueError, match="not supported for the fastparquet"):
+                read_parquet(path, engine="fastparquet", use_nullable_dtypes=True)
