diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 75ca21e3e9f72..38ea26c014d4b 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -30,8 +30,8 @@ sql-other, html, xml, plot, output_formatting, clipboard, compression, test]`` ( .. _whatsnew_200.enhancements.io_use_nullable_dtypes_and_nullable_backend: -Configuration option, ``io.nullable_backend``, to return pyarrow-backed dtypes from IO functions -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Configuration option, ``mode.nullable_backend``, to return pyarrow-backed dtypes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The ``use_nullable_dtypes`` keyword argument has been expanded to the following functions to enable automatic conversion to nullable dtypes (:issue:`36712`) @@ -39,7 +39,7 @@ The ``use_nullable_dtypes`` keyword argument has been expanded to the following * :func:`read_excel` * :func:`read_sql` -Additionally a new global configuration, ``io.nullable_backend`` can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in the following functions +Additionally a new global configuration, ``mode.nullable_backend`` can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in the following functions to select the nullable dtypes implementation. * :func:`read_csv` (with ``engine="pyarrow"``) @@ -47,8 +47,14 @@ to select the nullable dtypes implementation. * :func:`read_parquet` * :func:`read_orc` -By default, ``io.nullable_backend`` is set to ``"pandas"`` to return existing, numpy-backed nullable dtypes, but it can also -be set to ``"pyarrow"`` to return pyarrow-backed, nullable :class:`ArrowDtype` (:issue:`48957`). + +And the following methods will also utilize the ``mode.nullable_backend`` option. + +* :meth:`DataFrame.convert_dtypes` +* :meth:`Series.convert_dtypes` + +By default, ``mode.nullable_backend`` is set to ``"pandas"`` to return existing, numpy-backed nullable dtypes, but it can also +be set to ``"pyarrow"`` to return pyarrow-backed, nullable :class:`ArrowDtype` (:issue:`48957`, :issue:`49997`). .. ipython:: python @@ -57,12 +63,12 @@ be set to ``"pyarrow"`` to return pyarrow-backed, nullable :class:`ArrowDtype` ( 1,2.5,True,a,,,,, 3,4.5,False,b,6,7.5,True,a, """) - with pd.option_context("io.nullable_backend", "pandas"): + with pd.option_context("mode.nullable_backend", "pandas"): df = pd.read_csv(data, use_nullable_dtypes=True) df.dtypes data.seek(0) - with pd.option_context("io.nullable_backend", "pyarrow"): + with pd.option_context("mode.nullable_backend", "pyarrow"): df_pyarrow = pd.read_csv(data, use_nullable_dtypes=True, engine="pyarrow") df_pyarrow.dtypes diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index d1a52798360bd..0aca950fe6f3b 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -539,6 +539,12 @@ def use_inf_as_na_cb(key) -> None: The default storage for StringDtype. """ +nullable_backend_doc = """ +: string + The nullable dtype implementation to return. + Available options: 'pandas', 'pyarrow', the default is 'pandas'. 
+""" + with cf.config_prefix("mode"): cf.register_option( "string_storage", @@ -546,6 +552,12 @@ def use_inf_as_na_cb(key) -> None: string_storage_doc, validator=is_one_of_factory(["python", "pyarrow"]), ) + cf.register_option( + "nullable_backend", + "pandas", + nullable_backend_doc, + validator=is_one_of_factory(["pandas", "pyarrow"]), + ) # Set up the io.excel specific reader configuration. reader_engine_doc = """ @@ -673,20 +685,6 @@ def use_inf_as_na_cb(key) -> None: validator=is_one_of_factory(["auto", "sqlalchemy"]), ) -io_nullable_backend_doc = """ -: string - The nullable dtype implementation to return when ``use_nullable_dtypes=True``. - Available options: 'pandas', 'pyarrow', the default is 'pandas'. -""" - -with cf.config_prefix("io.nullable_backend"): - cf.register_option( - "io_nullable_backend", - "pandas", - io_nullable_backend_doc, - validator=is_one_of_factory(["pandas", "pyarrow"]), - ) - # -------- # Plotting # --------- diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 142a555efd632..455257833ec0d 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -9,6 +9,7 @@ from typing import ( TYPE_CHECKING, Any, + Literal, Sized, TypeVar, cast, @@ -70,10 +71,12 @@ pandas_dtype as pandas_dtype_func, ) from pandas.core.dtypes.dtypes import ( + BaseMaskedDtype, CategoricalDtype, DatetimeTZDtype, ExtensionDtype, IntervalDtype, + PandasExtensionDtype, PeriodDtype, ) from pandas.core.dtypes.generic import ( @@ -958,6 +961,7 @@ def convert_dtypes( convert_boolean: bool = True, convert_floating: bool = True, infer_objects: bool = False, + nullable_backend: Literal["pandas", "pyarrow"] = "pandas", ) -> DtypeObj: """ Convert objects to best possible type, and optionally, @@ -979,6 +983,11 @@ def convert_dtypes( infer_objects : bool, defaults False Whether to also infer objects to float/int if possible. Is only hit if the object array contains pd.NA. + nullable_backend : str, default "pandas" + Nullable dtype implementation to use. 
+ + * "pandas" returns numpy-backed nullable types + * "pyarrow" returns pyarrow-backed nullable types using ``ArrowDtype`` Returns ------- @@ -997,9 +1006,9 @@ def convert_dtypes( if is_string_dtype(inferred_dtype): if not convert_string or inferred_dtype == "bytes": - return input_array.dtype + inferred_dtype = input_array.dtype else: - return pandas_dtype_func("string") + inferred_dtype = pandas_dtype_func("string") if convert_integer: target_int_dtype = pandas_dtype_func("Int64") @@ -1020,7 +1029,7 @@ def convert_dtypes( elif ( infer_objects and is_object_dtype(input_array.dtype) - and inferred_dtype == "integer" + and (isinstance(inferred_dtype, str) and inferred_dtype == "integer") ): inferred_dtype = target_int_dtype @@ -1047,7 +1056,10 @@ def convert_dtypes( elif ( infer_objects and is_object_dtype(input_array.dtype) - and inferred_dtype == "mixed-integer-float" + and ( + isinstance(inferred_dtype, str) + and inferred_dtype == "mixed-integer-float" + ) ): inferred_dtype = pandas_dtype_func("Float64") @@ -1062,7 +1074,27 @@ def convert_dtypes( inferred_dtype = input_array.dtype else: - return input_array.dtype + inferred_dtype = input_array.dtype + + if nullable_backend == "pyarrow": + from pandas.core.arrays.arrow.array import to_pyarrow_type + from pandas.core.arrays.arrow.dtype import ArrowDtype + from pandas.core.arrays.string_ import StringDtype + + if isinstance(inferred_dtype, PandasExtensionDtype): + base_dtype = inferred_dtype.base + elif isinstance(inferred_dtype, (BaseMaskedDtype, ArrowDtype)): + base_dtype = inferred_dtype.numpy_dtype + elif isinstance(inferred_dtype, StringDtype): + base_dtype = np.dtype(str) + else: + # error: Incompatible types in assignment (expression has type + # "Union[str, Any, dtype[Any], ExtensionDtype]", + # variable has type "Union[dtype[Any], ExtensionDtype, None]") + base_dtype = inferred_dtype # type: ignore[assignment] + pa_type = to_pyarrow_type(base_dtype) + if pa_type is not None: + inferred_dtype = ArrowDtype(pa_type) # error: Incompatible return value type (got "Union[str, Union[dtype[Any], # ExtensionDtype]]", expected "Union[dtype[Any], ExtensionDtype]") diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b79f8ec41320d..a4d99cb0eca42 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6433,6 +6433,13 @@ def convert_dtypes( In the future, as new dtypes are added that support ``pd.NA``, the results of this method will change to support those new dtypes. + .. versionadded:: 2.0 + The nullable dtype implementation can be configured by calling + ``pd.set_option("mode.nullable_backend", "pandas")`` to use + numpy-backed nullable dtypes or + ``pd.set_option("mode.nullable_backend", "pyarrow")`` to use + pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``). 
+ Examples -------- >>> df = pd.DataFrame( diff --git a/pandas/core/series.py b/pandas/core/series.py index fd65aa9cf3733..b1758b485bf98 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -5410,6 +5410,7 @@ def _convert_dtypes( input_series = input_series.copy() if convert_string or convert_integer or convert_boolean or convert_floating: + nullable_backend = get_option("mode.nullable_backend") inferred_dtype = convert_dtypes( input_series._values, convert_string, @@ -5417,6 +5418,7 @@ def _convert_dtypes( convert_boolean, convert_floating, infer_objects, + nullable_backend, ) result = input_series.astype(inferred_dtype) else: diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 42a75701cd9c4..bb8abc902010e 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -58,16 +58,17 @@ def read_orc( If True, use dtypes that use ``pd.NA`` as missing value indicator for the resulting DataFrame. - The nullable dtype implementation can be configured by setting the global - ``io.nullable_backend`` configuration option to ``"pandas"`` to use - numpy-backed nullable dtypes or ``"pyarrow"`` to use pyarrow-backed - nullable dtypes (using ``pd.ArrowDtype``). + The nullable dtype implementation can be configured by calling + ``pd.set_option("mode.nullable_backend", "pandas")`` to use + numpy-backed nullable dtypes or + ``pd.set_option("mode.nullable_backend", "pyarrow")`` to use + pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``). .. versionadded:: 2.0.0 .. note - Currently only ``io.nullable_backend`` set to ``"pyarrow"`` is supported. + Currently only ``mode.nullable_backend`` set to ``"pyarrow"`` is supported. **kwargs Any additional kwargs are passed to pyarrow. @@ -89,10 +90,10 @@ def read_orc( orc_file = orc.ORCFile(handles.handle) pa_table = orc_file.read(columns=columns, **kwargs) if use_nullable_dtypes: - nullable_backend = get_option("io.nullable_backend") + nullable_backend = get_option("mode.nullable_backend") if nullable_backend != "pyarrow": raise NotImplementedError( - f"io.nullable_backend set to {nullable_backend} is not implemented." + f"mode.nullable_backend set to {nullable_backend} is not implemented." ) df = DataFrame( { diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 1c14722227124..8767596af3e58 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -222,7 +222,7 @@ def read( ) -> DataFrame: kwargs["use_pandas_metadata"] = True - nullable_backend = get_option("io.nullable_backend") + nullable_backend = get_option("mode.nullable_backend") to_pandas_kwargs = {} if use_nullable_dtypes: import pandas as pd @@ -508,10 +508,11 @@ def read_parquet( .. versionadded:: 1.2.0 - The nullable dtype implementation can be configured by setting the global - ``io.nullable_backend`` configuration option to ``"pandas"`` to use - numpy-backed nullable dtypes or ``"pyarrow"`` to use pyarrow-backed - nullable dtypes (using ``pd.ArrowDtype``). + The nullable dtype implementation can be configured by calling + ``pd.set_option("mode.nullable_backend", "pandas")`` to use + numpy-backed nullable dtypes or + ``pd.set_option("mode.nullable_backend", "pyarrow")`` to use + pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``). .. 
versionadded:: 2.0.0 diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 68158a30f7fdf..3ef53eeca6ee1 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -151,7 +151,7 @@ def read(self) -> DataFrame: ) if ( self.kwds["use_nullable_dtypes"] - and get_option("io.nullable_backend") == "pyarrow" + and get_option("mode.nullable_backend") == "pyarrow" ): frame = DataFrame( { diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 111a827459022..e6f4830846c77 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -713,7 +713,7 @@ def _infer_types( use_nullable_dtypes: Literal[True] | Literal[False] = ( self.use_nullable_dtypes and no_dtype_specified ) - nullable_backend = get_option("io.nullable_backend") + nullable_backend = get_option("mode.nullable_backend") result: ArrayLike if try_num_bool and is_object_dtype(values.dtype): diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 96c2fd08bbc59..0690ebfae727f 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -398,10 +398,11 @@ set to True, nullable dtypes are used for all dtypes that have a nullable implementation, even if no nulls are present. - The nullable dtype implementation can be configured by setting the global - ``io.nullable_backend`` configuration option to ``"pandas"`` to use - numpy-backed nullable dtypes or ``"pyarrow"`` to use pyarrow-backed - nullable dtypes (using ``pd.ArrowDtype``). + The nullable dtype implementation can be configured by calling + ``pd.set_option("mode.nullable_backend", "pandas")`` to use + numpy-backed nullable dtypes or + ``pd.set_option("mode.nullable_backend", "pyarrow")`` to use + pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``). .. versionadded:: 2.0 @@ -560,11 +561,11 @@ def _read( ) elif ( kwds.get("use_nullable_dtypes", False) - and get_option("io.nullable_backend") == "pyarrow" + and get_option("mode.nullable_backend") == "pyarrow" ): raise NotImplementedError( f"use_nullable_dtypes=True and engine={kwds['engine']} with " - "io.nullable_backend set to 'pyarrow' is not implemented." + "mode.nullable_backend set to 'pyarrow' is not implemented." 
) else: chunksize = validate_integer("chunksize", chunksize, 1) diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index ec639ed7132a4..01c9a88468655 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -1,3 +1,5 @@ +import datetime + import numpy as np import pytest @@ -41,3 +43,80 @@ def test_convert_dtypes_retain_column_names(self): result = df.convert_dtypes() tm.assert_index_equal(result.columns, df.columns) assert result.columns.name == "cols" + + def test_pyarrow_nullable_backend(self): + pa = pytest.importorskip("pyarrow") + df = pd.DataFrame( + { + "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")), + "b": pd.Series(["x", "y", None], dtype=np.dtype("O")), + "c": pd.Series([True, False, None], dtype=np.dtype("O")), + "d": pd.Series([np.nan, 100.5, 200], dtype=np.dtype("float")), + "e": pd.Series(pd.date_range("2022", periods=3)), + "f": pd.Series(pd.timedelta_range("1D", periods=3)), + } + ) + with pd.option_context("mode.nullable_backend", "pyarrow"): + result = df.convert_dtypes() + expected = pd.DataFrame( + { + "a": pd.arrays.ArrowExtensionArray( + pa.array([1, 2, 3], type=pa.int32()) + ), + "b": pd.arrays.ArrowExtensionArray(pa.array(["x", "y", None])), + "c": pd.arrays.ArrowExtensionArray(pa.array([True, False, None])), + "d": pd.arrays.ArrowExtensionArray(pa.array([None, 100.5, 200.0])), + "e": pd.arrays.ArrowExtensionArray( + pa.array( + [ + datetime.datetime(2022, 1, 1), + datetime.datetime(2022, 1, 2), + datetime.datetime(2022, 1, 3), + ], + type=pa.timestamp(unit="ns"), + ) + ), + "f": pd.arrays.ArrowExtensionArray( + pa.array( + [ + datetime.timedelta(1), + datetime.timedelta(2), + datetime.timedelta(3), + ], + type=pa.duration("ns"), + ) + ), + } + ) + tm.assert_frame_equal(result, expected) + + def test_pyarrow_nullable_backend_already_pyarrow(self): + pytest.importorskip("pyarrow") + expected = pd.DataFrame([1, 2, 3], dtype="int64[pyarrow]") + with pd.option_context("mode.nullable_backend", "pyarrow"): + result = expected.convert_dtypes() + tm.assert_frame_equal(result, expected) + + def test_pyarrow_nullable_backend_from_pandas_nullable(self): + pa = pytest.importorskip("pyarrow") + df = pd.DataFrame( + { + "a": pd.Series([1, 2, None], dtype="Int32"), + "b": pd.Series(["x", "y", None], dtype="string[python]"), + "c": pd.Series([True, False, None], dtype="boolean"), + "d": pd.Series([None, 100.5, 200], dtype="Float64"), + } + ) + with pd.option_context("mode.nullable_backend", "pyarrow"): + result = df.convert_dtypes() + expected = pd.DataFrame( + { + "a": pd.arrays.ArrowExtensionArray( + pa.array([1, 2, None], type=pa.int32()) + ), + "b": pd.arrays.ArrowExtensionArray(pa.array(["x", "y", None])), + "c": pd.arrays.ArrowExtensionArray(pa.array([True, False, None])), + "d": pd.arrays.ArrowExtensionArray(pa.array([None, 100.5, 200.0])), + } + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index a204132963c94..3e879b72a8dcf 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -561,7 +561,7 @@ def test_use_nullable_dtypes(self, read_ext, nullable_backend): ) with tm.ensure_clean(read_ext) as file_path: df.to_excel(file_path, "test", index=False) - with pd.option_context("io.nullable_backend", nullable_backend): + with pd.option_context("mode.nullable_backend", nullable_backend): result = pd.read_excel( 
file_path, sheet_name="test", use_nullable_dtypes=True ) diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 030b38cceeb39..95411310bcc35 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -498,7 +498,7 @@ def test_use_nullable_dtypes_pyarrow_backend(all_parsers, request): 1,2.5,True,a,,,,,12-31-2019, 3,4.5,False,b,6,7.5,True,a,12-31-2019, """ - with pd.option_context("io.nullable_backend", "pyarrow"): + with pd.option_context("mode.nullable_backend", "pyarrow"): if parser.engine != "pyarrow": request.node.add_marker( pytest.mark.xfail( diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index e747c03568603..1b811fc18c7f8 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -309,9 +309,9 @@ def test_orc_use_nullable_dtypes_pandas_backend_not_supported(dirpath): input_file = os.path.join(dirpath, "TestOrcFile.emptyFile.orc") with pytest.raises( NotImplementedError, - match="io.nullable_backend set to pandas is not implemented.", + match="mode.nullable_backend set to pandas is not implemented.", ): - with pd.option_context("io.nullable_backend", "pandas"): + with pd.option_context("mode.nullable_backend", "pandas"): read_orc(input_file, use_nullable_dtypes=True) @@ -337,7 +337,7 @@ def test_orc_use_nullable_dtypes_pyarrow_backend(): } ) bytes_data = df.copy().to_orc() - with pd.option_context("io.nullable_backend", "pyarrow"): + with pd.option_context("mode.nullable_backend", "pyarrow"): result = read_orc(BytesIO(bytes_data), use_nullable_dtypes=True) expected = pd.DataFrame( { diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 8694abc498fe5..a609d0774757e 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1037,7 +1037,7 @@ def test_read_use_nullable_types_pyarrow_config(self, pa, df_full): pd.ArrowDtype(pyarrow.timestamp(unit="us", tz="Europe/Brussels")) ) - with pd.option_context("io.nullable_backend", "pyarrow"): + with pd.option_context("mode.nullable_backend", "pyarrow"): check_round_trip( df, engine=pa,
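
Taken together, the hunks above rename the option from ``io.nullable_backend`` to ``mode.nullable_backend`` and extend it to ``DataFrame.convert_dtypes`` / ``Series.convert_dtypes``. A minimal sketch of how the renamed option is exercised, assuming a pandas build that contains this patch plus an installed pyarrow (the sample data and column names here are illustrative only, not taken from the patch):

    # Minimal sketch; assumes pandas with this patch applied and pyarrow installed.
    from io import StringIO

    import pandas as pd

    data = StringIO(
        """a,b,c
    1,2.5,x
    3,,y
    """
    )

    # IO path: with use_nullable_dtypes=True, the pyarrow csv engine returns
    # ArrowDtype-backed columns when mode.nullable_backend is "pyarrow".
    with pd.option_context("mode.nullable_backend", "pyarrow"):
        df = pd.read_csv(data, use_nullable_dtypes=True, engine="pyarrow")
    print(df.dtypes)

    # convert_dtypes path: the same option now also selects the nullable
    # implementation used by DataFrame.convert_dtypes / Series.convert_dtypes.
    df2 = pd.DataFrame(
        {
            "a": pd.Series([1, 2, 3], dtype="int32"),
            "b": ["x", "y", None],
        }
    )
    with pd.option_context("mode.nullable_backend", "pyarrow"):
        converted = df2.convert_dtypes()
    print(converted.dtypes)  # pyarrow-backed dtypes, e.g. int32[pyarrow]

Note the restrictions visible in the hunks themselves: ``read_orc`` currently raises ``NotImplementedError`` unless ``mode.nullable_backend`` is set to ``"pyarrow"``, and the non-pyarrow ``read_csv`` engines raise ``NotImplementedError`` when the option is set to ``"pyarrow"``.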