Skip to content

Commit 7b0d4dd

Browse files
authored
ENH: Allow convert_dtypes to convert to pd.ArrowDtype (#50094)
* Refactor to mode.nullable_backend * Update code paths * Add tests and whatsnew * Add unit test converting pandas nullable to pyarrow * Typing * Ensure comparison with string * typing and another comparison
1 parent e17a7f8 commit 7b0d4dd

File tree

15 files changed

+179
-52
lines changed

15 files changed

+179
-52
lines changed

doc/source/whatsnew/v2.0.0.rst

+13-7
Original file line numberDiff line numberDiff line change
@@ -30,25 +30,31 @@ sql-other, html, xml, plot, output_formatting, clipboard, compression, test]`` (
3030

3131
.. _whatsnew_200.enhancements.io_use_nullable_dtypes_and_nullable_backend:
3232

33-
Configuration option, ``io.nullable_backend``, to return pyarrow-backed dtypes from IO functions
34-
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
33+
Configuration option, ``mode.nullable_backend``, to return pyarrow-backed dtypes
34+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
3535

3636
The ``use_nullable_dtypes`` keyword argument has been expanded to the following functions to enable automatic conversion to nullable dtypes (:issue:`36712`)
3737

3838
* :func:`read_csv`
3939
* :func:`read_excel`
4040
* :func:`read_sql`
4141

42-
Additionally a new global configuration, ``io.nullable_backend`` can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in the following functions
42+
Additionally, a new global configuration option, ``mode.nullable_backend``, can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in the following functions
4343
to select the nullable dtypes implementation.
4444

4545
* :func:`read_csv` (with ``engine="pyarrow"``)
4646
* :func:`read_excel`
4747
* :func:`read_parquet`
4848
* :func:`read_orc`
4949

50-
By default, ``io.nullable_backend`` is set to ``"pandas"`` to return existing, numpy-backed nullable dtypes, but it can also
51-
be set to ``"pyarrow"`` to return pyarrow-backed, nullable :class:`ArrowDtype` (:issue:`48957`).
50+
51+
The following methods will also use the ``mode.nullable_backend`` option:
52+
53+
* :meth:`DataFrame.convert_dtypes`
54+
* :meth:`Series.convert_dtypes`
55+
56+
By default, ``mode.nullable_backend`` is set to ``"pandas"`` to return existing, numpy-backed nullable dtypes, but it can also
57+
be set to ``"pyarrow"`` to return pyarrow-backed, nullable :class:`ArrowDtype` (:issue:`48957`, :issue:`49997`).
5258

5359
.. ipython:: python
5460
@@ -57,12 +63,12 @@ be set to ``"pyarrow"`` to return pyarrow-backed, nullable :class:`ArrowDtype` (
5763
1,2.5,True,a,,,,,
5864
3,4.5,False,b,6,7.5,True,a,
5965
""")
60-
with pd.option_context("io.nullable_backend", "pandas"):
66+
with pd.option_context("mode.nullable_backend", "pandas"):
6167
df = pd.read_csv(data, use_nullable_dtypes=True)
6268
df.dtypes
6369
6470
data.seek(0)
65-
with pd.option_context("io.nullable_backend", "pyarrow"):
71+
with pd.option_context("mode.nullable_backend", "pyarrow"):
6672
df_pyarrow = pd.read_csv(data, use_nullable_dtypes=True, engine="pyarrow")
6773
df_pyarrow.dtypes
6874

pandas/core/config_init.py

+12-14
Original file line numberDiff line numberDiff line change
@@ -539,13 +539,25 @@ def use_inf_as_na_cb(key) -> None:
539539
The default storage for StringDtype.
540540
"""
541541

542+
nullable_backend_doc = """
543+
: string
544+
The nullable dtype implementation to return.
545+
Available options: 'pandas', 'pyarrow', the default is 'pandas'.
546+
"""
547+
542548
with cf.config_prefix("mode"):
543549
cf.register_option(
544550
"string_storage",
545551
"python",
546552
string_storage_doc,
547553
validator=is_one_of_factory(["python", "pyarrow"]),
548554
)
555+
cf.register_option(
556+
"nullable_backend",
557+
"pandas",
558+
nullable_backend_doc,
559+
validator=is_one_of_factory(["pandas", "pyarrow"]),
560+
)
549561

550562
# Set up the io.excel specific reader configuration.
551563
reader_engine_doc = """
@@ -673,20 +685,6 @@ def use_inf_as_na_cb(key) -> None:
673685
validator=is_one_of_factory(["auto", "sqlalchemy"]),
674686
)
675687

676-
io_nullable_backend_doc = """
677-
: string
678-
The nullable dtype implementation to return when ``use_nullable_dtypes=True``.
679-
Available options: 'pandas', 'pyarrow', the default is 'pandas'.
680-
"""
681-
682-
with cf.config_prefix("io.nullable_backend"):
683-
cf.register_option(
684-
"io_nullable_backend",
685-
"pandas",
686-
io_nullable_backend_doc,
687-
validator=is_one_of_factory(["pandas", "pyarrow"]),
688-
)
689-
690688
# --------
691689
# Plotting
692690
# ---------

pandas/core/dtypes/cast.py

+37-5
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from typing import (
1010
TYPE_CHECKING,
1111
Any,
12+
Literal,
1213
Sized,
1314
TypeVar,
1415
cast,
@@ -70,10 +71,12 @@
7071
pandas_dtype as pandas_dtype_func,
7172
)
7273
from pandas.core.dtypes.dtypes import (
74+
BaseMaskedDtype,
7375
CategoricalDtype,
7476
DatetimeTZDtype,
7577
ExtensionDtype,
7678
IntervalDtype,
79+
PandasExtensionDtype,
7780
PeriodDtype,
7881
)
7982
from pandas.core.dtypes.generic import (
@@ -958,6 +961,7 @@ def convert_dtypes(
958961
convert_boolean: bool = True,
959962
convert_floating: bool = True,
960963
infer_objects: bool = False,
964+
nullable_backend: Literal["pandas", "pyarrow"] = "pandas",
961965
) -> DtypeObj:
962966
"""
963967
Convert objects to best possible type, and optionally,
@@ -979,6 +983,11 @@ def convert_dtypes(
979983
infer_objects : bool, default False
980984
Whether to also infer objects to float/int if possible. Is only hit if the
981985
object array contains pd.NA.
986+
nullable_backend : str, default "pandas"
987+
Nullable dtype implementation to use.
988+
989+
* "pandas" returns numpy-backed nullable types
990+
* "pyarrow" returns pyarrow-backed nullable types using ``ArrowDtype``
982991
983992
Returns
984993
-------
@@ -997,9 +1006,9 @@ def convert_dtypes(
9971006

9981007
if is_string_dtype(inferred_dtype):
9991008
if not convert_string or inferred_dtype == "bytes":
1000-
return input_array.dtype
1009+
inferred_dtype = input_array.dtype
10011010
else:
1002-
return pandas_dtype_func("string")
1011+
inferred_dtype = pandas_dtype_func("string")
10031012

10041013
if convert_integer:
10051014
target_int_dtype = pandas_dtype_func("Int64")
@@ -1020,7 +1029,7 @@ def convert_dtypes(
10201029
elif (
10211030
infer_objects
10221031
and is_object_dtype(input_array.dtype)
1023-
and inferred_dtype == "integer"
1032+
and (isinstance(inferred_dtype, str) and inferred_dtype == "integer")
10241033
):
10251034
inferred_dtype = target_int_dtype
10261035

@@ -1047,7 +1056,10 @@ def convert_dtypes(
10471056
elif (
10481057
infer_objects
10491058
and is_object_dtype(input_array.dtype)
1050-
and inferred_dtype == "mixed-integer-float"
1059+
and (
1060+
isinstance(inferred_dtype, str)
1061+
and inferred_dtype == "mixed-integer-float"
1062+
)
10511063
):
10521064
inferred_dtype = pandas_dtype_func("Float64")
10531065

@@ -1062,7 +1074,27 @@ def convert_dtypes(
10621074
inferred_dtype = input_array.dtype
10631075

10641076
else:
1065-
return input_array.dtype
1077+
inferred_dtype = input_array.dtype
1078+
1079+
if nullable_backend == "pyarrow":
1080+
from pandas.core.arrays.arrow.array import to_pyarrow_type
1081+
from pandas.core.arrays.arrow.dtype import ArrowDtype
1082+
from pandas.core.arrays.string_ import StringDtype
1083+
1084+
if isinstance(inferred_dtype, PandasExtensionDtype):
1085+
base_dtype = inferred_dtype.base
1086+
elif isinstance(inferred_dtype, (BaseMaskedDtype, ArrowDtype)):
1087+
base_dtype = inferred_dtype.numpy_dtype
1088+
elif isinstance(inferred_dtype, StringDtype):
1089+
base_dtype = np.dtype(str)
1090+
else:
1091+
# error: Incompatible types in assignment (expression has type
1092+
# "Union[str, Any, dtype[Any], ExtensionDtype]",
1093+
# variable has type "Union[dtype[Any], ExtensionDtype, None]")
1094+
base_dtype = inferred_dtype # type: ignore[assignment]
1095+
pa_type = to_pyarrow_type(base_dtype)
1096+
if pa_type is not None:
1097+
inferred_dtype = ArrowDtype(pa_type)
10661098

10671099
# error: Incompatible return value type (got "Union[str, Union[dtype[Any],
10681100
# ExtensionDtype]]", expected "Union[dtype[Any], ExtensionDtype]")

pandas/core/generic.py

+7
Original file line numberDiff line numberDiff line change
@@ -6433,6 +6433,13 @@ def convert_dtypes(
64336433
In the future, as new dtypes are added that support ``pd.NA``, the results
64346434
of this method will change to support those new dtypes.
64356435
6436+
.. versionadded:: 2.0
6437+
The nullable dtype implementation can be configured by calling
6438+
``pd.set_option("mode.nullable_backend", "pandas")`` to use
6439+
numpy-backed nullable dtypes or
6440+
``pd.set_option("mode.nullable_backend", "pyarrow")`` to use
6441+
pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``).
6442+
64366443
Examples
64376444
--------
64386445
>>> df = pd.DataFrame(

pandas/core/series.py

+2
Original file line numberDiff line numberDiff line change
@@ -5410,13 +5410,15 @@ def _convert_dtypes(
54105410
input_series = input_series.copy()
54115411

54125412
if convert_string or convert_integer or convert_boolean or convert_floating:
5413+
nullable_backend = get_option("mode.nullable_backend")
54135414
inferred_dtype = convert_dtypes(
54145415
input_series._values,
54155416
convert_string,
54165417
convert_integer,
54175418
convert_boolean,
54185419
convert_floating,
54195420
infer_objects,
5421+
nullable_backend,
54205422
)
54215423
result = input_series.astype(inferred_dtype)
54225424
else:

pandas/io/orc.py

+8-7
Original file line numberDiff line numberDiff line change
@@ -58,16 +58,17 @@ def read_orc(
5858
If True, use dtypes that use ``pd.NA`` as missing value indicator
5959
for the resulting DataFrame.
6060
61-
The nullable dtype implementation can be configured by setting the global
62-
``io.nullable_backend`` configuration option to ``"pandas"`` to use
63-
numpy-backed nullable dtypes or ``"pyarrow"`` to use pyarrow-backed
64-
nullable dtypes (using ``pd.ArrowDtype``).
61+
The nullable dtype implementation can be configured by calling
62+
``pd.set_option("mode.nullable_backend", "pandas")`` to use
63+
numpy-backed nullable dtypes or
64+
``pd.set_option("mode.nullable_backend", "pyarrow")`` to use
65+
pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``).
6566
6667
.. versionadded:: 2.0.0
6768
6869
.. note::
6970
70-
Currently only ``io.nullable_backend`` set to ``"pyarrow"`` is supported.
71+
Currently only ``mode.nullable_backend`` set to ``"pyarrow"`` is supported.
7172
7273
**kwargs
7374
Any additional kwargs are passed to pyarrow.
@@ -89,10 +90,10 @@ def read_orc(
8990
orc_file = orc.ORCFile(handles.handle)
9091
pa_table = orc_file.read(columns=columns, **kwargs)
9192
if use_nullable_dtypes:
92-
nullable_backend = get_option("io.nullable_backend")
93+
nullable_backend = get_option("mode.nullable_backend")
9394
if nullable_backend != "pyarrow":
9495
raise NotImplementedError(
95-
f"io.nullable_backend set to {nullable_backend} is not implemented."
96+
f"mode.nullable_backend set to {nullable_backend} is not implemented."
9697
)
9798
df = DataFrame(
9899
{

pandas/io/parquet.py

+6-5
Original file line numberDiff line numberDiff line change
@@ -222,7 +222,7 @@ def read(
222222
) -> DataFrame:
223223
kwargs["use_pandas_metadata"] = True
224224

225-
nullable_backend = get_option("io.nullable_backend")
225+
nullable_backend = get_option("mode.nullable_backend")
226226
to_pandas_kwargs = {}
227227
if use_nullable_dtypes:
228228
import pandas as pd
@@ -508,10 +508,11 @@ def read_parquet(
508508
509509
.. versionadded:: 1.2.0
510510
511-
The nullable dtype implementation can be configured by setting the global
512-
``io.nullable_backend`` configuration option to ``"pandas"`` to use
513-
numpy-backed nullable dtypes or ``"pyarrow"`` to use pyarrow-backed
514-
nullable dtypes (using ``pd.ArrowDtype``).
511+
The nullable dtype implementation can be configured by calling
512+
``pd.set_option("mode.nullable_backend", "pandas")`` to use
513+
numpy-backed nullable dtypes or
514+
``pd.set_option("mode.nullable_backend", "pyarrow")`` to use
515+
pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``).
515516
516517
.. versionadded:: 2.0.0
517518

pandas/io/parsers/arrow_parser_wrapper.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@ def read(self) -> DataFrame:
151151
)
152152
if (
153153
self.kwds["use_nullable_dtypes"]
154-
and get_option("io.nullable_backend") == "pyarrow"
154+
and get_option("mode.nullable_backend") == "pyarrow"
155155
):
156156
frame = DataFrame(
157157
{

pandas/io/parsers/base_parser.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -713,7 +713,7 @@ def _infer_types(
713713
use_nullable_dtypes: Literal[True] | Literal[False] = (
714714
self.use_nullable_dtypes and no_dtype_specified
715715
)
716-
nullable_backend = get_option("io.nullable_backend")
716+
nullable_backend = get_option("mode.nullable_backend")
717717
result: ArrayLike
718718

719719
if try_num_bool and is_object_dtype(values.dtype):

pandas/io/parsers/readers.py

+7-6
Original file line numberDiff line numberDiff line change
@@ -398,10 +398,11 @@
398398
set to True, nullable dtypes are used for all dtypes that have a nullable
399399
implementation, even if no nulls are present.
400400
401-
The nullable dtype implementation can be configured by setting the global
402-
``io.nullable_backend`` configuration option to ``"pandas"`` to use
403-
numpy-backed nullable dtypes or ``"pyarrow"`` to use pyarrow-backed
404-
nullable dtypes (using ``pd.ArrowDtype``).
401+
The nullable dtype implementation can be configured by calling
402+
``pd.set_option("mode.nullable_backend", "pandas")`` to use
403+
numpy-backed nullable dtypes or
404+
``pd.set_option("mode.nullable_backend", "pyarrow")`` to use
405+
pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``).
405406
406407
.. versionadded:: 2.0
407408
@@ -560,11 +561,11 @@ def _read(
560561
)
561562
elif (
562563
kwds.get("use_nullable_dtypes", False)
563-
and get_option("io.nullable_backend") == "pyarrow"
564+
and get_option("mode.nullable_backend") == "pyarrow"
564565
):
565566
raise NotImplementedError(
566567
f"use_nullable_dtypes=True and engine={kwds['engine']} with "
567-
"io.nullable_backend set to 'pyarrow' is not implemented."
568+
"mode.nullable_backend set to 'pyarrow' is not implemented."
568569
)
569570
else:
570571
chunksize = validate_integer("chunksize", chunksize, 1)

0 commit comments

Comments
 (0)