Skip to content

ENH: Allow convert_dtypes to convert to pd.ArrowDtype #50094

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Dec 14, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 13 additions & 7 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -30,25 +30,31 @@ sql-other, html, xml, plot, output_formatting, clipboard, compression, test]`` (

.. _whatsnew_200.enhancements.io_use_nullable_dtypes_and_nullable_backend:

Configuration option, ``io.nullable_backend``, to return pyarrow-backed dtypes from IO functions
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Configuration option, ``mode.nullable_backend``, to return pyarrow-backed dtypes
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The ``use_nullable_dtypes`` keyword argument has been expanded to the following functions to enable automatic conversion to nullable dtypes (:issue:`36712`)

* :func:`read_csv`
* :func:`read_excel`
* :func:`read_sql`

Additionally a new global configuration, ``io.nullable_backend`` can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in the following functions
Additionally, a new global configuration, ``mode.nullable_backend``, can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in the following functions
to select the nullable dtypes implementation.

* :func:`read_csv` (with ``engine="pyarrow"``)
* :func:`read_excel`
* :func:`read_parquet`
* :func:`read_orc`

By default, ``io.nullable_backend`` is set to ``"pandas"`` to return existing, numpy-backed nullable dtypes, but it can also
be set to ``"pyarrow"`` to return pyarrow-backed, nullable :class:`ArrowDtype` (:issue:`48957`).

The following methods also use the ``mode.nullable_backend`` option.

* :meth:`DataFrame.convert_dtypes`
* :meth:`Series.convert_dtypes`

By default, ``mode.nullable_backend`` is set to ``"pandas"`` to return existing, numpy-backed nullable dtypes, but it can also
be set to ``"pyarrow"`` to return pyarrow-backed, nullable :class:`ArrowDtype` (:issue:`48957`, :issue:`49997`).

.. ipython:: python

Expand All @@ -57,12 +63,12 @@ be set to ``"pyarrow"`` to return pyarrow-backed, nullable :class:`ArrowDtype` (
1,2.5,True,a,,,,,
3,4.5,False,b,6,7.5,True,a,
""")
with pd.option_context("io.nullable_backend", "pandas"):
with pd.option_context("mode.nullable_backend", "pandas"):
df = pd.read_csv(data, use_nullable_dtypes=True)
df.dtypes

data.seek(0)
with pd.option_context("io.nullable_backend", "pyarrow"):
with pd.option_context("mode.nullable_backend", "pyarrow"):
df_pyarrow = pd.read_csv(data, use_nullable_dtypes=True, engine="pyarrow")
df_pyarrow.dtypes

Expand Down
26 changes: 12 additions & 14 deletions pandas/core/config_init.py
Original file line number Diff line number Diff line change
Expand Up @@ -539,13 +539,25 @@ def use_inf_as_na_cb(key) -> None:
The default storage for StringDtype.
"""

nullable_backend_doc = """
: string
The nullable dtype implementation to return.
Available options: 'pandas', 'pyarrow', the default is 'pandas'.
"""

with cf.config_prefix("mode"):
cf.register_option(
"string_storage",
"python",
string_storage_doc,
validator=is_one_of_factory(["python", "pyarrow"]),
)
cf.register_option(
"nullable_backend",
"pandas",
nullable_backend_doc,
validator=is_one_of_factory(["pandas", "pyarrow"]),
)

# Set up the io.excel specific reader configuration.
reader_engine_doc = """
Expand Down Expand Up @@ -673,20 +685,6 @@ def use_inf_as_na_cb(key) -> None:
validator=is_one_of_factory(["auto", "sqlalchemy"]),
)

io_nullable_backend_doc = """
: string
The nullable dtype implementation to return when ``use_nullable_dtypes=True``.
Available options: 'pandas', 'pyarrow', the default is 'pandas'.
"""

with cf.config_prefix("io.nullable_backend"):
cf.register_option(
"io_nullable_backend",
"pandas",
io_nullable_backend_doc,
validator=is_one_of_factory(["pandas", "pyarrow"]),
)

# --------
# Plotting
# ---------
Expand Down
42 changes: 37 additions & 5 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from typing import (
TYPE_CHECKING,
Any,
Literal,
Sized,
TypeVar,
cast,
Expand Down Expand Up @@ -70,10 +71,12 @@
pandas_dtype as pandas_dtype_func,
)
from pandas.core.dtypes.dtypes import (
BaseMaskedDtype,
CategoricalDtype,
DatetimeTZDtype,
ExtensionDtype,
IntervalDtype,
PandasExtensionDtype,
PeriodDtype,
)
from pandas.core.dtypes.generic import (
Expand Down Expand Up @@ -958,6 +961,7 @@ def convert_dtypes(
convert_boolean: bool = True,
convert_floating: bool = True,
infer_objects: bool = False,
nullable_backend: Literal["pandas", "pyarrow"] = "pandas",
) -> DtypeObj:
"""
Convert objects to best possible type, and optionally,
Expand All @@ -979,6 +983,11 @@ def convert_dtypes(
infer_objects : bool, default False
Whether to also infer objects to float/int if possible. Is only hit if the
object array contains pd.NA.
nullable_backend : str, default "pandas"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How does this interact with the mode.nullable_backend option? My guess is folks would expect nullable_backend to take precedence if it's specified, but for mode.nullable_backend to take precedence if nullable_backend isn't given.

Here's a tiny example of the use case where I'm asking what happens:

with pd.option_context("mode.nullable_backend", "pyarrow"):
    result = expected.convert_dtypes(nullable_backend="pandas")

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This line specifically is a private helper function that backs the public Series/DataFrame.convert_dtypes.

The public Series/DataFrame.convert_dtypes will not accept a keyword argument nullable_backend

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I see -- thanks for clarifying. So the nullable_backend config option is how public APIs toggle here. Good to know.

Nullable dtype implementation to use.

* "pandas" returns numpy-backed nullable types
* "pyarrow" returns pyarrow-backed nullable types using ``ArrowDtype``

Returns
-------
Expand All @@ -997,9 +1006,9 @@ def convert_dtypes(

if is_string_dtype(inferred_dtype):
if not convert_string or inferred_dtype == "bytes":
return input_array.dtype
inferred_dtype = input_array.dtype
else:
return pandas_dtype_func("string")
inferred_dtype = pandas_dtype_func("string")

if convert_integer:
target_int_dtype = pandas_dtype_func("Int64")
Expand All @@ -1020,7 +1029,7 @@ def convert_dtypes(
elif (
infer_objects
and is_object_dtype(input_array.dtype)
and inferred_dtype == "integer"
and (isinstance(inferred_dtype, str) and inferred_dtype == "integer")
):
inferred_dtype = target_int_dtype

Expand All @@ -1047,7 +1056,10 @@ def convert_dtypes(
elif (
infer_objects
and is_object_dtype(input_array.dtype)
and inferred_dtype == "mixed-integer-float"
and (
isinstance(inferred_dtype, str)
and inferred_dtype == "mixed-integer-float"
)
):
inferred_dtype = pandas_dtype_func("Float64")

Expand All @@ -1062,7 +1074,27 @@ def convert_dtypes(
inferred_dtype = input_array.dtype

else:
return input_array.dtype
inferred_dtype = input_array.dtype

if nullable_backend == "pyarrow":
from pandas.core.arrays.arrow.array import to_pyarrow_type
from pandas.core.arrays.arrow.dtype import ArrowDtype
from pandas.core.arrays.string_ import StringDtype

if isinstance(inferred_dtype, PandasExtensionDtype):
base_dtype = inferred_dtype.base
elif isinstance(inferred_dtype, (BaseMaskedDtype, ArrowDtype)):
base_dtype = inferred_dtype.numpy_dtype
elif isinstance(inferred_dtype, StringDtype):
base_dtype = np.dtype(str)
else:
# error: Incompatible types in assignment (expression has type
# "Union[str, Any, dtype[Any], ExtensionDtype]",
# variable has type "Union[dtype[Any], ExtensionDtype, None]")
base_dtype = inferred_dtype # type: ignore[assignment]
pa_type = to_pyarrow_type(base_dtype)
if pa_type is not None:
inferred_dtype = ArrowDtype(pa_type)

# error: Incompatible return value type (got "Union[str, Union[dtype[Any],
# ExtensionDtype]]", expected "Union[dtype[Any], ExtensionDtype]")
Expand Down
7 changes: 7 additions & 0 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -6433,6 +6433,13 @@ def convert_dtypes(
In the future, as new dtypes are added that support ``pd.NA``, the results
of this method will change to support those new dtypes.

.. versionadded:: 2.0
The nullable dtype implementation can be configured by calling
``pd.set_option("mode.nullable_backend", "pandas")`` to use
numpy-backed nullable dtypes or
``pd.set_option("mode.nullable_backend", "pyarrow")`` to use
pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``).

Examples
--------
>>> df = pd.DataFrame(
Expand Down
2 changes: 2 additions & 0 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -5410,13 +5410,15 @@ def _convert_dtypes(
input_series = input_series.copy()

if convert_string or convert_integer or convert_boolean or convert_floating:
nullable_backend = get_option("mode.nullable_backend")
inferred_dtype = convert_dtypes(
input_series._values,
convert_string,
convert_integer,
convert_boolean,
convert_floating,
infer_objects,
nullable_backend,
)
result = input_series.astype(inferred_dtype)
else:
Expand Down
15 changes: 8 additions & 7 deletions pandas/io/orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,16 +58,17 @@ def read_orc(
If True, use dtypes that use ``pd.NA`` as missing value indicator
for the resulting DataFrame.

The nullable dtype implementation can be configured by setting the global
``io.nullable_backend`` configuration option to ``"pandas"`` to use
numpy-backed nullable dtypes or ``"pyarrow"`` to use pyarrow-backed
nullable dtypes (using ``pd.ArrowDtype``).
The nullable dtype implementation can be configured by calling
``pd.set_option("mode.nullable_backend", "pandas")`` to use
numpy-backed nullable dtypes or
``pd.set_option("mode.nullable_backend", "pyarrow")`` to use
pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``).

.. versionadded:: 2.0.0

.. note::

Currently only ``io.nullable_backend`` set to ``"pyarrow"`` is supported.
Currently only ``mode.nullable_backend`` set to ``"pyarrow"`` is supported.

**kwargs
Any additional kwargs are passed to pyarrow.
Expand All @@ -89,10 +90,10 @@ def read_orc(
orc_file = orc.ORCFile(handles.handle)
pa_table = orc_file.read(columns=columns, **kwargs)
if use_nullable_dtypes:
nullable_backend = get_option("io.nullable_backend")
nullable_backend = get_option("mode.nullable_backend")
if nullable_backend != "pyarrow":
raise NotImplementedError(
f"io.nullable_backend set to {nullable_backend} is not implemented."
f"mode.nullable_backend set to {nullable_backend} is not implemented."
)
df = DataFrame(
{
Expand Down
11 changes: 6 additions & 5 deletions pandas/io/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ def read(
) -> DataFrame:
kwargs["use_pandas_metadata"] = True

nullable_backend = get_option("io.nullable_backend")
nullable_backend = get_option("mode.nullable_backend")
to_pandas_kwargs = {}
if use_nullable_dtypes:
import pandas as pd
Expand Down Expand Up @@ -508,10 +508,11 @@ def read_parquet(

.. versionadded:: 1.2.0

The nullable dtype implementation can be configured by setting the global
``io.nullable_backend`` configuration option to ``"pandas"`` to use
numpy-backed nullable dtypes or ``"pyarrow"`` to use pyarrow-backed
nullable dtypes (using ``pd.ArrowDtype``).
The nullable dtype implementation can be configured by calling
``pd.set_option("mode.nullable_backend", "pandas")`` to use
numpy-backed nullable dtypes or
``pd.set_option("mode.nullable_backend", "pyarrow")`` to use
pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``).

.. versionadded:: 2.0.0

Expand Down
2 changes: 1 addition & 1 deletion pandas/io/parsers/arrow_parser_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ def read(self) -> DataFrame:
)
if (
self.kwds["use_nullable_dtypes"]
and get_option("io.nullable_backend") == "pyarrow"
and get_option("mode.nullable_backend") == "pyarrow"
):
frame = DataFrame(
{
Expand Down
2 changes: 1 addition & 1 deletion pandas/io/parsers/base_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -713,7 +713,7 @@ def _infer_types(
use_nullable_dtypes: Literal[True] | Literal[False] = (
self.use_nullable_dtypes and no_dtype_specified
)
nullable_backend = get_option("io.nullable_backend")
nullable_backend = get_option("mode.nullable_backend")
result: ArrayLike

if try_num_bool and is_object_dtype(values.dtype):
Expand Down
13 changes: 7 additions & 6 deletions pandas/io/parsers/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -398,10 +398,11 @@
set to True, nullable dtypes are used for all dtypes that have a nullable
implementation, even if no nulls are present.

The nullable dtype implementation can be configured by setting the global
``io.nullable_backend`` configuration option to ``"pandas"`` to use
numpy-backed nullable dtypes or ``"pyarrow"`` to use pyarrow-backed
nullable dtypes (using ``pd.ArrowDtype``).
The nullable dtype implementation can be configured by calling
``pd.set_option("mode.nullable_backend", "pandas")`` to use
numpy-backed nullable dtypes or
``pd.set_option("mode.nullable_backend", "pyarrow")`` to use
pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``).

.. versionadded:: 2.0

Expand Down Expand Up @@ -560,11 +561,11 @@ def _read(
)
elif (
kwds.get("use_nullable_dtypes", False)
and get_option("io.nullable_backend") == "pyarrow"
and get_option("mode.nullable_backend") == "pyarrow"
):
raise NotImplementedError(
f"use_nullable_dtypes=True and engine={kwds['engine']} with "
"io.nullable_backend set to 'pyarrow' is not implemented."
"mode.nullable_backend set to 'pyarrow' is not implemented."
)
else:
chunksize = validate_integer("chunksize", chunksize, 1)
Expand Down
Loading