From a2e05e6e9ea80094aa3f7179a1afd253135e88c0 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 2 Dec 2022 15:12:10 -0800 Subject: [PATCH 1/7] Refactor to mode.nullable_backend --- pandas/core/config_init.py | 26 +++++++++---------- pandas/io/orc.py | 8 +++--- pandas/io/parquet.py | 4 +-- pandas/io/parsers/arrow_parser_wrapper.py | 2 +- pandas/io/parsers/base_parser.py | 2 +- pandas/io/parsers/readers.py | 6 ++--- pandas/tests/io/excel/test_readers.py | 2 +- .../io/parser/dtypes/test_dtypes_basic.py | 2 +- pandas/tests/io/test_orc.py | 6 ++--- pandas/tests/io/test_parquet.py | 2 +- 10 files changed, 29 insertions(+), 31 deletions(-) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index b101b25a10a80..aad203bb41fbf 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -553,6 +553,12 @@ def use_inf_as_na_cb(key) -> None: The default storage for StringDtype. """ +nullable_backend_doc = """ +: string + The nullable dtype implementation to return. + Available options: 'pandas', 'pyarrow', the default is 'pandas'. +""" + with cf.config_prefix("mode"): cf.register_option( "string_storage", @@ -560,6 +566,12 @@ def use_inf_as_na_cb(key) -> None: string_storage_doc, validator=is_one_of_factory(["python", "pyarrow"]), ) + cf.register_option( + "nullable_backend", + "pandas", + nullable_backend_doc, + validator=is_one_of_factory(["pandas", "pyarrow"]), + ) # Set up the io.excel specific reader configuration. reader_engine_doc = """ @@ -687,20 +699,6 @@ def use_inf_as_na_cb(key) -> None: validator=is_one_of_factory(["auto", "sqlalchemy"]), ) -io_nullable_backend_doc = """ -: string - The nullable dtype implementation to return when ``use_nullable_dtypes=True``. - Available options: 'pandas', 'pyarrow', the default is 'pandas'. -""" - -with cf.config_prefix("io.nullable_backend"): - cf.register_option( - "io_nullable_backend", - "pandas", - io_nullable_backend_doc, - validator=is_one_of_factory(["pandas", "pyarrow"]), - ) - # -------- # Plotting # --------- diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 42a75701cd9c4..869a5fb4c84cd 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -59,7 +59,7 @@ def read_orc( for the resulting DataFrame. The nullable dtype implementation can be configured by setting the global - ``io.nullable_backend`` configuration option to ``"pandas"`` to use + ``mode.nullable_backend`` configuration option to ``"pandas"`` to use numpy-backed nullable dtypes or ``"pyarrow"`` to use pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``). @@ -67,7 +67,7 @@ def read_orc( .. note - Currently only ``io.nullable_backend`` set to ``"pyarrow"`` is supported. + Currently only ``mode.nullable_backend`` set to ``"pyarrow"`` is supported. **kwargs Any additional kwargs are passed to pyarrow. @@ -89,10 +89,10 @@ def read_orc( orc_file = orc.ORCFile(handles.handle) pa_table = orc_file.read(columns=columns, **kwargs) if use_nullable_dtypes: - nullable_backend = get_option("io.nullable_backend") + nullable_backend = get_option("mode.nullable_backend") if nullable_backend != "pyarrow": raise NotImplementedError( - f"io.nullable_backend set to {nullable_backend} is not implemented." + f"mode.nullable_backend set to {nullable_backend} is not implemented." 
) df = DataFrame( { diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 1c14722227124..ea3c85722f134 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -222,7 +222,7 @@ def read( ) -> DataFrame: kwargs["use_pandas_metadata"] = True - nullable_backend = get_option("io.nullable_backend") + nullable_backend = get_option("mode.nullable_backend") to_pandas_kwargs = {} if use_nullable_dtypes: import pandas as pd @@ -509,7 +509,7 @@ def read_parquet( .. versionadded:: 1.2.0 The nullable dtype implementation can be configured by setting the global - ``io.nullable_backend`` configuration option to ``"pandas"`` to use + ``mode.nullable_backend`` configuration option to ``"pandas"`` to use numpy-backed nullable dtypes or ``"pyarrow"`` to use pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``). diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 68158a30f7fdf..3ef53eeca6ee1 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -151,7 +151,7 @@ def read(self) -> DataFrame: ) if ( self.kwds["use_nullable_dtypes"] - and get_option("io.nullable_backend") == "pyarrow" + and get_option("mode.nullable_backend") == "pyarrow" ): frame = DataFrame( { diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index c5fc054952b1f..b1a2f2b8cdc8a 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -714,7 +714,7 @@ def _infer_types( use_nullable_dtypes: Literal[True] | Literal[False] = ( self.use_nullable_dtypes and no_dtype_specified ) - nullable_backend = get_option("io.nullable_backend") + nullable_backend = get_option("mode.nullable_backend") result: ArrayLike if try_num_bool and is_object_dtype(values.dtype): diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index d9c2403a19d0c..e67a0146d21dd 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -396,7 +396,7 @@ implementation, even if no nulls are present. The nullable dtype implementation can be configured by setting the global - ``io.nullable_backend`` configuration option to ``"pandas"`` to use + ``mode.nullable_backend`` configuration option to ``"pandas"`` to use numpy-backed nullable dtypes or ``"pyarrow"`` to use pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``). @@ -558,11 +558,11 @@ def _read( ) elif ( kwds.get("use_nullable_dtypes", False) - and get_option("io.nullable_backend") == "pyarrow" + and get_option("mode.nullable_backend") == "pyarrow" ): raise NotImplementedError( f"use_nullable_dtypes=True and engine={kwds['engine']} with " - "io.nullable_backend set to 'pyarrow' is not implemented." + "mode.nullable_backend set to 'pyarrow' is not implemented." 
) else: chunksize = validate_integer("chunksize", chunksize, 1) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 822e24b224052..3c13b6f3b469c 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -561,7 +561,7 @@ def test_use_nullable_dtypes(self, read_ext, nullable_backend): ) with tm.ensure_clean(read_ext) as file_path: df.to_excel(file_path, "test", index=False) - with pd.option_context("io.nullable_backend", nullable_backend): + with pd.option_context("mode.nullable_backend", nullable_backend): result = pd.read_excel( file_path, sheet_name="test", use_nullable_dtypes=True ) diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 030b38cceeb39..95411310bcc35 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -498,7 +498,7 @@ def test_use_nullable_dtypes_pyarrow_backend(all_parsers, request): 1,2.5,True,a,,,,,12-31-2019, 3,4.5,False,b,6,7.5,True,a,12-31-2019, """ - with pd.option_context("io.nullable_backend", "pyarrow"): + with pd.option_context("mode.nullable_backend", "pyarrow"): if parser.engine != "pyarrow": request.node.add_marker( pytest.mark.xfail( diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index e747c03568603..1b811fc18c7f8 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -309,9 +309,9 @@ def test_orc_use_nullable_dtypes_pandas_backend_not_supported(dirpath): input_file = os.path.join(dirpath, "TestOrcFile.emptyFile.orc") with pytest.raises( NotImplementedError, - match="io.nullable_backend set to pandas is not implemented.", + match="mode.nullable_backend set to pandas is not implemented.", ): - with pd.option_context("io.nullable_backend", "pandas"): + with pd.option_context("mode.nullable_backend", "pandas"): read_orc(input_file, use_nullable_dtypes=True) @@ -337,7 +337,7 @@ def test_orc_use_nullable_dtypes_pyarrow_backend(): } ) bytes_data = df.copy().to_orc() - with pd.option_context("io.nullable_backend", "pyarrow"): + with pd.option_context("mode.nullable_backend", "pyarrow"): result = read_orc(BytesIO(bytes_data), use_nullable_dtypes=True) expected = pd.DataFrame( { diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 75683a1d96bfb..e6eacf448aae5 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1037,7 +1037,7 @@ def test_read_use_nullable_types_pyarrow_config(self, pa, df_full): pd.ArrowDtype(pyarrow.timestamp(unit="us", tz="Europe/Brussels")) ) - with pd.option_context("io.nullable_backend", "pyarrow"): + with pd.option_context("mode.nullable_backend", "pyarrow"): check_round_trip( df, engine=pa, From ee39eac55484d08affc14bf5a18b3bc2fd88cdce Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 5 Dec 2022 13:35:29 -0800 Subject: [PATCH 2/7] Update code paths --- pandas/core/dtypes/cast.py | 23 ++++++++++++++++++++++- pandas/core/generic.py | 7 +++++++ pandas/core/series.py | 2 ++ pandas/io/orc.py | 9 +++++---- pandas/io/parquet.py | 9 +++++---- pandas/io/parsers/readers.py | 9 +++++---- 6 files changed, 46 insertions(+), 13 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 36c713cab7123..e97414e37f44b 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -9,6 +9,7 @@ from typing import ( TYPE_CHECKING, Any, + 
Literal, Sized, TypeVar, cast, @@ -71,10 +72,12 @@ pandas_dtype as pandas_dtype_func, ) from pandas.core.dtypes.dtypes import ( + BaseMaskedDtype, CategoricalDtype, DatetimeTZDtype, ExtensionDtype, IntervalDtype, + PandasExtensionDtype, PeriodDtype, ) from pandas.core.dtypes.generic import ( @@ -1007,6 +1010,7 @@ def convert_dtypes( convert_boolean: bool = True, convert_floating: bool = True, infer_objects: bool = False, + nullable_backend: Literal["pandas", "pyarrow"] = "pandas", ) -> DtypeObj: """ Convert objects to best possible type, and optionally, @@ -1028,6 +1032,11 @@ def convert_dtypes( infer_objects : bool, defaults False Whether to also infer objects to float/int if possible. Is only hit if the object array contains pd.NA. + nullable_backend : str, default "pandas" + Nullable dtype implementation to use. + + * "pandas" returns numpy-backed nullable types + * "pyarrow" returns pyarrow-backed nullable types using ``ArrowDtype`` Returns ------- @@ -1111,7 +1120,19 @@ def convert_dtypes( inferred_dtype = input_array.dtype else: - return input_array.dtype + inferred_dtype = input_array.dtype + + if nullable_backend == "pyarrow": + from pandas.core.arrays.arrow.array import to_pyarrow_type + from pandas.core.arrays.arrow.dtype import ArrowDtype + + if isinstance(inferred_dtype, (PandasExtensionDtype, BaseMaskedDtype)): + base_dtype = inferred_dtype.base + else: + base_dtype = inferred_dtype + pa_type = to_pyarrow_type(base_dtype) + if pa_type is not None: + inferred_dtype = ArrowDtype(pa_type) # error: Incompatible return value type (got "Union[str, Union[dtype[Any], # ExtensionDtype]]", expected "Union[dtype[Any], ExtensionDtype]") diff --git a/pandas/core/generic.py b/pandas/core/generic.py index d1e48a3d10a1e..ee7161c841b11 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6428,6 +6428,13 @@ def convert_dtypes( In the future, as new dtypes are added that support ``pd.NA``, the results of this method will change to support those new dtypes. + .. versionadded:: 2.0 + The nullable dtype implementation can be configured by calling + ``pd.set_option("mode.nullable_backend", "pandas")`` to use + numpy-backed nullable dtypes or + ``pd.set_option("mode.nullable_backend", "pyarrow")`` to use + pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``). + Examples -------- >>> df = pd.DataFrame( diff --git a/pandas/core/series.py b/pandas/core/series.py index 48bc07ca022ee..c5b3eae9435e7 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -5406,6 +5406,7 @@ def _convert_dtypes( input_series = input_series.copy() if convert_string or convert_integer or convert_boolean or convert_floating: + nullable_backend = get_option("mode.nullable_backend") inferred_dtype = convert_dtypes( input_series._values, convert_string, @@ -5413,6 +5414,7 @@ def _convert_dtypes( convert_boolean, convert_floating, infer_objects, + nullable_backend, ) result = input_series.astype(inferred_dtype) else: diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 869a5fb4c84cd..bb8abc902010e 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -58,10 +58,11 @@ def read_orc( If True, use dtypes that use ``pd.NA`` as missing value indicator for the resulting DataFrame. - The nullable dtype implementation can be configured by setting the global - ``mode.nullable_backend`` configuration option to ``"pandas"`` to use - numpy-backed nullable dtypes or ``"pyarrow"`` to use pyarrow-backed - nullable dtypes (using ``pd.ArrowDtype``). 
+ The nullable dtype implementation can be configured by calling + ``pd.set_option("mode.nullable_backend", "pandas")`` to use + numpy-backed nullable dtypes or + ``pd.set_option("mode.nullable_backend", "pyarrow")`` to use + pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``). .. versionadded:: 2.0.0 diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index ea3c85722f134..8767596af3e58 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -508,10 +508,11 @@ def read_parquet( .. versionadded:: 1.2.0 - The nullable dtype implementation can be configured by setting the global - ``mode.nullable_backend`` configuration option to ``"pandas"`` to use - numpy-backed nullable dtypes or ``"pyarrow"`` to use pyarrow-backed - nullable dtypes (using ``pd.ArrowDtype``). + The nullable dtype implementation can be configured by calling + ``pd.set_option("mode.nullable_backend", "pandas")`` to use + numpy-backed nullable dtypes or + ``pd.set_option("mode.nullable_backend", "pyarrow")`` to use + pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``). .. versionadded:: 2.0.0 diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index e67a0146d21dd..336af30a01770 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -395,10 +395,11 @@ set to True, nullable dtypes are used for all dtypes that have a nullable implementation, even if no nulls are present. - The nullable dtype implementation can be configured by setting the global - ``mode.nullable_backend`` configuration option to ``"pandas"`` to use - numpy-backed nullable dtypes or ``"pyarrow"`` to use pyarrow-backed - nullable dtypes (using ``pd.ArrowDtype``). + The nullable dtype implementation can be configured by calling + ``pd.set_option("mode.nullable_backend", "pandas")`` to use + numpy-backed nullable dtypes or + ``pd.set_option("mode.nullable_backend", "pyarrow")`` to use + pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``). .. versionadded:: 2.0 From da6bac130d4d564707841f8286cb07baef4c3dde Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 6 Dec 2022 14:00:35 -0800 Subject: [PATCH 3/7] Add tests and whatsnew --- doc/source/whatsnew/v2.0.0.rst | 20 ++++--- pandas/core/dtypes/cast.py | 11 +++- .../frame/methods/test_convert_dtypes.py | 55 +++++++++++++++++++ 3 files changed, 76 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 7838ef8df4164..35d7e038d84d9 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -30,15 +30,15 @@ sql-other, html, xml, plot, output_formatting, clipboard, compression, test]`` ( .. 
_whatsnew_200.enhancements.io_use_nullable_dtypes_and_nullable_backend: -Configuration option, ``io.nullable_backend``, to return pyarrow-backed dtypes from IO functions -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Configuration option, ``mode.nullable_backend``, to return pyarrow-backed dtypes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The ``use_nullable_dtypes`` keyword argument has been expanded to the following functions to enable automatic conversion to nullable dtypes (:issue:`36712`) * :func:`read_csv` * :func:`read_excel` -Additionally a new global configuration, ``io.nullable_backend`` can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in the following functions +Additionally a new global configuration, ``mode.nullable_backend`` can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in the following functions to select the nullable dtypes implementation. * :func:`read_csv` (with ``engine="pyarrow"``) @@ -46,8 +46,14 @@ to select the nullable dtypes implementation. * :func:`read_parquet` * :func:`read_orc` -By default, ``io.nullable_backend`` is set to ``"pandas"`` to return existing, numpy-backed nullable dtypes, but it can also -be set to ``"pyarrow"`` to return pyarrow-backed, nullable :class:`ArrowDtype` (:issue:`48957`). + +And the following methods will also utilize the ``mode.nullable_backend`` option. + +* :meth:`DataFrame.convert_dtypes` +* :meth:`Series.convert_dtypes` + +By default, ``mode.nullable_backend`` is set to ``"pandas"`` to return existing, numpy-backed nullable dtypes, but it can also +be set to ``"pyarrow"`` to return pyarrow-backed, nullable :class:`ArrowDtype` (:issue:`48957`, :issue:`49997`). .. 
ipython:: python @@ -56,12 +62,12 @@ be set to ``"pyarrow"`` to return pyarrow-backed, nullable :class:`ArrowDtype` ( 1,2.5,True,a,,,,, 3,4.5,False,b,6,7.5,True,a, """) - with pd.option_context("io.nullable_backend", "pandas"): + with pd.option_context("mode.nullable_backend", "pandas"): df = pd.read_csv(data, use_nullable_dtypes=True) df.dtypes data.seek(0) - with pd.option_context("io.nullable_backend", "pyarrow"): + with pd.option_context("mode.nullable_backend", "pyarrow"): df_pyarrow = pd.read_csv(data, use_nullable_dtypes=True, engine="pyarrow") df_pyarrow.dtypes diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 366c6df747f37..d6d021c108297 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1006,9 +1006,9 @@ def convert_dtypes( if is_string_dtype(inferred_dtype): if not convert_string or inferred_dtype == "bytes": - return input_array.dtype + inferred_dtype = input_array.dtype else: - return pandas_dtype_func("string") + inferred_dtype = pandas_dtype_func("string") if convert_integer: target_int_dtype = pandas_dtype_func("Int64") @@ -1076,9 +1076,14 @@ def convert_dtypes( if nullable_backend == "pyarrow": from pandas.core.arrays.arrow.array import to_pyarrow_type from pandas.core.arrays.arrow.dtype import ArrowDtype + from pandas.core.arrays.string_ import StringDtype - if isinstance(inferred_dtype, (PandasExtensionDtype, BaseMaskedDtype)): + if isinstance(inferred_dtype, PandasExtensionDtype): base_dtype = inferred_dtype.base + elif isinstance(inferred_dtype, (BaseMaskedDtype, ArrowDtype)): + base_dtype = inferred_dtype.numpy_dtype + elif isinstance(inferred_dtype, StringDtype): + base_dtype = str else: base_dtype = inferred_dtype pa_type = to_pyarrow_type(base_dtype) diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index ec639ed7132a4..14bfcd4dad1ca 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -1,3 +1,5 @@ +import datetime + import numpy as np import pytest @@ -41,3 +43,56 @@ def test_convert_dtypes_retain_column_names(self): result = df.convert_dtypes() tm.assert_index_equal(result.columns, df.columns) assert result.columns.name == "cols" + + def test_pyarrow_nullable_backend(self): + pa = pytest.importorskip("pyarrow") + df = pd.DataFrame( + { + "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")), + "b": pd.Series(["x", "y", None], dtype=np.dtype("O")), + "c": pd.Series([True, False, None], dtype=np.dtype("O")), + "d": pd.Series([np.nan, 100.5, 200], dtype=np.dtype("float")), + "e": pd.Series(pd.date_range("2022", periods=3)), + "f": pd.Series(pd.timedelta_range("1D", periods=3)), + } + ) + with pd.option_context("mode.nullable_backend", "pyarrow"): + result = df.convert_dtypes() + expected = pd.DataFrame( + { + "a": pd.arrays.ArrowExtensionArray( + pa.array([1, 2, 3], type=pa.int32()) + ), + "b": pd.arrays.ArrowExtensionArray(pa.array(["x", "y", None])), + "c": pd.arrays.ArrowExtensionArray(pa.array([True, False, None])), + "d": pd.arrays.ArrowExtensionArray(pa.array([None, 100.5, 200.0])), + "e": pd.arrays.ArrowExtensionArray( + pa.array( + [ + datetime.datetime(2022, 1, 1), + datetime.datetime(2022, 1, 2), + datetime.datetime(2022, 1, 3), + ], + type=pa.timestamp(unit="ns"), + ) + ), + "f": pd.arrays.ArrowExtensionArray( + pa.array( + [ + datetime.timedelta(1), + datetime.timedelta(2), + datetime.timedelta(3), + ], + type=pa.duration("ns"), + ) + ), + } + ) + 
tm.assert_frame_equal(result, expected) + + def test_pyarrow_nullable_backend_already_pyarrow(self): + pytest.importorskip("pyarrow") + expected = pd.DataFrame([1, 2, 3], dtype="int64[pyarrow]") + with pd.option_context("mode.nullable_backend", "pyarrow"): + result = expected.convert_dtypes() + tm.assert_frame_equal(result, expected) From a083fad2d683f1f7c0d8384d9599d85d710cf63b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 7 Dec 2022 16:46:22 -0800 Subject: [PATCH 4/7] Add unit test converting pandas nullable to pyarrow --- .../frame/methods/test_convert_dtypes.py | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index 14bfcd4dad1ca..01c9a88468655 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -96,3 +96,27 @@ def test_pyarrow_nullable_backend_already_pyarrow(self): with pd.option_context("mode.nullable_backend", "pyarrow"): result = expected.convert_dtypes() tm.assert_frame_equal(result, expected) + + def test_pyarrow_nullable_backend_from_pandas_nullable(self): + pa = pytest.importorskip("pyarrow") + df = pd.DataFrame( + { + "a": pd.Series([1, 2, None], dtype="Int32"), + "b": pd.Series(["x", "y", None], dtype="string[python]"), + "c": pd.Series([True, False, None], dtype="boolean"), + "d": pd.Series([None, 100.5, 200], dtype="Float64"), + } + ) + with pd.option_context("mode.nullable_backend", "pyarrow"): + result = df.convert_dtypes() + expected = pd.DataFrame( + { + "a": pd.arrays.ArrowExtensionArray( + pa.array([1, 2, None], type=pa.int32()) + ), + "b": pd.arrays.ArrowExtensionArray(pa.array(["x", "y", None])), + "c": pd.arrays.ArrowExtensionArray(pa.array([True, False, None])), + "d": pd.arrays.ArrowExtensionArray(pa.array([None, 100.5, 200.0])), + } + ) + tm.assert_frame_equal(result, expected) From 30c5c164b51eaacaa3170aaadfa6a94161c6b998 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 12 Dec 2022 11:45:12 -0800 Subject: [PATCH 5/7] Typing --- pandas/core/dtypes/cast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 5d596c7564605..8108bd45cb058 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1083,7 +1083,7 @@ def convert_dtypes( elif isinstance(inferred_dtype, (BaseMaskedDtype, ArrowDtype)): base_dtype = inferred_dtype.numpy_dtype elif isinstance(inferred_dtype, StringDtype): - base_dtype = str + base_dtype = np.dtype(str) else: base_dtype = inferred_dtype pa_type = to_pyarrow_type(base_dtype) From a4c409ee27526430ddbbc15e872374e49e95e666 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 13 Dec 2022 15:35:37 -0800 Subject: [PATCH 6/7] Ensure comparison with string --- pandas/core/dtypes/cast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 8108bd45cb058..fff962898cbe1 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1029,7 +1029,7 @@ def convert_dtypes( elif ( infer_objects and is_object_dtype(input_array.dtype) - and inferred_dtype == "integer" + and (isinstance(inferred_dtype, str) and inferred_dtype == "integer") ): inferred_dtype = target_int_dtype From 9d457c35a8cfceef4ae05075f0e39d1ec255f7ef Mon 
Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 13 Dec 2022 18:14:32 -0800 Subject: [PATCH 7/7] typing and another comparison --- pandas/core/dtypes/cast.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index fff962898cbe1..455257833ec0d 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1056,7 +1056,10 @@ def convert_dtypes( elif ( infer_objects and is_object_dtype(input_array.dtype) - and inferred_dtype == "mixed-integer-float" + and ( + isinstance(inferred_dtype, str) + and inferred_dtype == "mixed-integer-float" + ) ): inferred_dtype = pandas_dtype_func("Float64") @@ -1085,7 +1088,10 @@ def convert_dtypes( elif isinstance(inferred_dtype, StringDtype): base_dtype = np.dtype(str) else: - base_dtype = inferred_dtype + # error: Incompatible types in assignment (expression has type + # "Union[str, Any, dtype[Any], ExtensionDtype]", + # variable has type "Union[dtype[Any], ExtensionDtype, None]") + base_dtype = inferred_dtype # type: ignore[assignment] pa_type = to_pyarrow_type(base_dtype) if pa_type is not None: inferred_dtype = ArrowDtype(pa_type)
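
A minimal usage sketch of the renamed option as exercised by the reader changes in patches 1-2, assuming the series above is applied and pyarrow is installed; the CSV sample is illustrative only and follows the ipython block added to the whatsnew entry:

    import io

    import pandas as pd

    data = io.StringIO("a,b,c\n1,2.5,x\n,4.5,y\n")

    # Default backend: numpy-backed nullable dtypes (Int64, Float64, string, ...)
    with pd.option_context("mode.nullable_backend", "pandas"):
        df = pd.read_csv(data, use_nullable_dtypes=True)

    # pyarrow backend: pyarrow-backed dtypes via pd.ArrowDtype; per patch 1,
    # read_csv only supports this backend together with engine="pyarrow"
    data.seek(0)
    with pd.option_context("mode.nullable_backend", "pyarrow"):
        df_pa = pd.read_csv(data, use_nullable_dtypes=True, engine="pyarrow")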
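
Patches 2-7 make DataFrame.convert_dtypes and Series.convert_dtypes consult the same option; a short sketch based on the tests added in test_convert_dtypes.py, with illustrative column values:

    import pandas as pd

    df = pd.DataFrame(
        {
            "a": pd.Series([1, 2, None], dtype="Int32"),
            "b": pd.Series(["x", "y", None], dtype="string"),
            "c": pd.Series([True, False, None], dtype="boolean"),
        }
    )

    # Under the pyarrow backend, numpy-backed nullable (masked) dtypes are
    # mapped to the equivalent pd.ArrowDtype, e.g. Int32 -> int32[pyarrow]
    with pd.option_context("mode.nullable_backend", "pyarrow"):
        result = df.convert_dtypes()

    # Under the default "pandas" backend the masked dtypes are kept as-is
    converted = df.convert_dtypes()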