From 705bdefc0e179975579d32bea214f83fe6fcc142 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 23 Apr 2023 11:42:46 +0200 Subject: [PATCH 01/10] BUG: convert_dtypes ingoring convert keywords for pyarrow backend --- doc/source/whatsnew/v2.0.1.rst | 1 + pandas/core/dtypes/cast.py | 40 ++++++++++++------- .../frame/methods/test_convert_dtypes.py | 13 ++++++ 3 files changed, 40 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v2.0.1.rst b/doc/source/whatsnew/v2.0.1.rst index 7bd1b8f963726..12c92d56dde52 100644 --- a/doc/source/whatsnew/v2.0.1.rst +++ b/doc/source/whatsnew/v2.0.1.rst @@ -38,6 +38,7 @@ Bug fixes - Bug in :func:`to_datetime` and :func:`to_timedelta` when trying to convert numeric data with a :class:`ArrowDtype` (:issue:`52425`) - Bug in :func:`to_numeric` with ``errors='coerce'`` and ``dtype_backend='pyarrow'`` with :class:`ArrowDtype` data (:issue:`52588`) - Bug in :meth:`ArrowDtype.__from_arrow__` not respecting if dtype is explicitly given (:issue:`52533`) +- Bug in :meth:`DataFrame.convert_dtypes` ignores ``convert_foo`` keywords when ``dtype_backeend="pyarrow"`` (:issue:`52871`) - Bug in :meth:`DataFrame.max` and related casting different :class:`Timestamp` resolutions always to nanoseconds (:issue:`52524`) - Bug in :meth:`Series.describe` not returning :class:`ArrowDtype` with ``pyarrow.float64`` type with numeric data (:issue:`52427`) - Bug in :meth:`Series.dt.tz_localize` incorrectly localizing timestamps with :class:`ArrowDtype` (:issue:`52677`) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 2d45158cf2a9f..9a2417326846c 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1106,20 +1106,32 @@ def convert_dtypes( from pandas.core.arrays.arrow.dtype import ArrowDtype from pandas.core.arrays.string_ import StringDtype - if isinstance(inferred_dtype, PandasExtensionDtype): - base_dtype = inferred_dtype.base - elif isinstance(inferred_dtype, (BaseMaskedDtype, ArrowDtype)): - base_dtype = inferred_dtype.numpy_dtype - elif isinstance(inferred_dtype, StringDtype): - base_dtype = np.dtype(str) - else: - # error: Incompatible types in assignment (expression has type - # "Union[str, Any, dtype[Any], ExtensionDtype]", - # variable has type "Union[dtype[Any], ExtensionDtype, None]") - base_dtype = inferred_dtype # type: ignore[assignment] - pa_type = to_pyarrow_type(base_dtype) - if pa_type is not None: - inferred_dtype = ArrowDtype(pa_type) + if ( + convert_integer + and inferred_dtype.kind in "iu" + or inferred_dtype.kind in "fc" + and convert_floating + or convert_boolean + and inferred_dtype.kind == "b" + or convert_string + and is_string_dtype(inferred_dtype) + or inferred_dtype.kind not in "iufcb" + and not is_string_dtype(inferred_dtype) + ): + if isinstance(inferred_dtype, PandasExtensionDtype): + base_dtype = inferred_dtype.base + elif isinstance(inferred_dtype, (BaseMaskedDtype, ArrowDtype)): + base_dtype = inferred_dtype.numpy_dtype + elif isinstance(inferred_dtype, StringDtype): + base_dtype = np.dtype(str) + else: + # error: Incompatible types in assignment (expression has type + # "Union[str, Any, dtype[Any], ExtensionDtype]", + # variable has type "Union[dtype[Any], ExtensionDtype, None]") + base_dtype = inferred_dtype # type: ignore[assignment] + pa_type = to_pyarrow_type(base_dtype) + if pa_type is not None: + inferred_dtype = ArrowDtype(pa_type) # error: Incompatible return value type (got "Union[str, Union[dtype[Any], # ExtensionDtype]]", expected "Union[dtype[Any], ExtensionDtype]") diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index 6076933eecec4..62282c32f7345 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -134,3 +134,16 @@ def test_pyarrow_engine_lines_false(self): ) with pytest.raises(ValueError, match=msg): df.convert_dtypes(dtype_backend="numpy") + + def test_pyarrow_backend_no_convesion(self): + # GH#52871 + df = pd.DataFrame({"a": [1, 2], "b": 1.5, "c": True, "d": "x"}) + expected = df.copy() + result = df.convert_dtypes( + convert_floating=False, + convert_integer=False, + convert_boolean=False, + convert_string=False, + dtype_backend="pyarrow", + ) + tm.assert_frame_equal(result, expected) From 8d38e8caf6c5b370aa22e65ef54fd97e76d82915 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 23 Apr 2023 11:44:37 +0200 Subject: [PATCH 02/10] BUG: convert_dtypes ingoring convert keywords for pyarrow backend --- pandas/core/dtypes/cast.py | 3 +-- pandas/tests/frame/methods/test_convert_dtypes.py | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 9a2417326846c..4fb3bba8ee338 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1109,8 +1109,7 @@ def convert_dtypes( if ( convert_integer and inferred_dtype.kind in "iu" - or inferred_dtype.kind in "fc" - and convert_floating + or (convert_floating and inferred_dtype.kind in "fc") or convert_boolean and inferred_dtype.kind == "b" or convert_string diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index 62282c32f7345..e38023ddb1657 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -137,6 +137,7 @@ def test_pyarrow_engine_lines_false(self): def test_pyarrow_backend_no_convesion(self): # GH#52871 + pytest.importorskip("pyarrow") df = pd.DataFrame({"a": [1, 2], "b": 1.5, "c": True, "d": "x"}) expected = df.copy() result = df.convert_dtypes( From b690a5abf2a2855f05b8ce3209dc9f3b3036a459 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 23 Apr 2023 11:45:09 +0200 Subject: [PATCH 03/10] BUG: convert_dtypes ingoring convert keywords for pyarrow backend --- pandas/core/dtypes/cast.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 4fb3bba8ee338..c6c643a1b08d8 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1107,15 +1107,14 @@ def convert_dtypes( from pandas.core.arrays.string_ import StringDtype if ( - convert_integer - and inferred_dtype.kind in "iu" + (convert_integer and inferred_dtype.kind in "iu") or (convert_floating and inferred_dtype.kind in "fc") - or convert_boolean - and inferred_dtype.kind == "b" - or convert_string - and is_string_dtype(inferred_dtype) - or inferred_dtype.kind not in "iufcb" - and not is_string_dtype(inferred_dtype) + or (convert_boolean and inferred_dtype.kind == "b") + or (convert_string and is_string_dtype(inferred_dtype)) + or ( + inferred_dtype.kind not in "iufcb" + and not is_string_dtype(inferred_dtype) + ) ): if isinstance(inferred_dtype, PandasExtensionDtype): base_dtype = inferred_dtype.base From 3f7719d4c71b0cbb7a5d8925e6633dc59da9dff1 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 23 Apr 2023 11:49:36 +0200 Subject: [PATCH 04/10] Update v2.0.1.rst --- doc/source/whatsnew/v2.0.1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.1.rst b/doc/source/whatsnew/v2.0.1.rst index 12c92d56dde52..1925a8c5f642a 100644 --- a/doc/source/whatsnew/v2.0.1.rst +++ b/doc/source/whatsnew/v2.0.1.rst @@ -38,7 +38,7 @@ Bug fixes - Bug in :func:`to_datetime` and :func:`to_timedelta` when trying to convert numeric data with a :class:`ArrowDtype` (:issue:`52425`) - Bug in :func:`to_numeric` with ``errors='coerce'`` and ``dtype_backend='pyarrow'`` with :class:`ArrowDtype` data (:issue:`52588`) - Bug in :meth:`ArrowDtype.__from_arrow__` not respecting if dtype is explicitly given (:issue:`52533`) -- Bug in :meth:`DataFrame.convert_dtypes` ignores ``convert_foo`` keywords when ``dtype_backeend="pyarrow"`` (:issue:`52871`) +- Bug in :meth:`DataFrame.convert_dtypes` ignores ``convert_foo`` keywords when ``dtype_backeend="pyarrow"`` (:issue:`52872`) - Bug in :meth:`DataFrame.max` and related casting different :class:`Timestamp` resolutions always to nanoseconds (:issue:`52524`) - Bug in :meth:`Series.describe` not returning :class:`ArrowDtype` with ``pyarrow.float64`` type with numeric data (:issue:`52427`) - Bug in :meth:`Series.dt.tz_localize` incorrectly localizing timestamps with :class:`ArrowDtype` (:issue:`52677`) From d9c4d6608534905e0a48bce126a0b23fab7f35d9 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 23 Apr 2023 11:50:01 +0200 Subject: [PATCH 05/10] Update test_convert_dtypes.py --- pandas/tests/frame/methods/test_convert_dtypes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py b/pandas/tests/frame/methods/test_convert_dtypes.py index e38023ddb1657..a749cd11df4f7 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -136,7 +136,7 @@ def test_pyarrow_engine_lines_false(self): df.convert_dtypes(dtype_backend="numpy") def test_pyarrow_backend_no_convesion(self): - # GH#52871 + # GH#52872 pytest.importorskip("pyarrow") df = pd.DataFrame({"a": [1, 2], "b": 1.5, "c": True, "d": "x"}) expected = df.copy() From 5ba81477116516011206e98e083d9c1dd9138f4e Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 24 Apr 2023 20:47:33 +0200 Subject: [PATCH 06/10] Update doc/source/whatsnew/v2.0.1.rst Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- doc/source/whatsnew/v2.0.1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.1.rst b/doc/source/whatsnew/v2.0.1.rst index 1925a8c5f642a..bf1a981cb0969 100644 --- a/doc/source/whatsnew/v2.0.1.rst +++ b/doc/source/whatsnew/v2.0.1.rst @@ -38,7 +38,7 @@ Bug fixes - Bug in :func:`to_datetime` and :func:`to_timedelta` when trying to convert numeric data with a :class:`ArrowDtype` (:issue:`52425`) - Bug in :func:`to_numeric` with ``errors='coerce'`` and ``dtype_backend='pyarrow'`` with :class:`ArrowDtype` data (:issue:`52588`) - Bug in :meth:`ArrowDtype.__from_arrow__` not respecting if dtype is explicitly given (:issue:`52533`) -- Bug in :meth:`DataFrame.convert_dtypes` ignores ``convert_foo`` keywords when ``dtype_backeend="pyarrow"`` (:issue:`52872`) +- Bug in :meth:`DataFrame.convert_dtypes` ignores ``convert_*`` keywords when set to False ``dtype_backend="pyarrow"`` (:issue:`52872`) - Bug in :meth:`DataFrame.max` and related casting different :class:`Timestamp` resolutions always to nanoseconds (:issue:`52524`) - Bug in :meth:`Series.describe` not returning :class:`ArrowDtype` with ``pyarrow.float64`` type with numeric data (:issue:`52427`) - Bug in :meth:`Series.dt.tz_localize` incorrectly localizing timestamps with :class:`ArrowDtype` (:issue:`52677`) From e5ec6693c3ac31dd517c1cff3abde4af27b21ebe Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 24 Apr 2023 20:47:53 +0200 Subject: [PATCH 07/10] Update pandas/core/dtypes/cast.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/core/dtypes/cast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index c6c643a1b08d8..77e57c64be72b 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1110,7 +1110,7 @@ def convert_dtypes( (convert_integer and inferred_dtype.kind in "iu") or (convert_floating and inferred_dtype.kind in "fc") or (convert_boolean and inferred_dtype.kind == "b") - or (convert_string and is_string_dtype(inferred_dtype)) + or (convert_string and isinstance(inferred_dtype, StringDtype) or ( inferred_dtype.kind not in "iufcb" and not is_string_dtype(inferred_dtype) From 26cc20ebd8248bb81db00043847e995bb73a33c0 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 24 Apr 2023 20:49:42 +0200 Subject: [PATCH 08/10] Fix --- doc/source/whatsnew/v2.0.1.rst | 1 - pandas/core/dtypes/cast.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.0.1.rst b/doc/source/whatsnew/v2.0.1.rst index bf1a981cb0969..7bd1b8f963726 100644 --- a/doc/source/whatsnew/v2.0.1.rst +++ b/doc/source/whatsnew/v2.0.1.rst @@ -38,7 +38,6 @@ Bug fixes - Bug in :func:`to_datetime` and :func:`to_timedelta` when trying to convert numeric data with a :class:`ArrowDtype` (:issue:`52425`) - Bug in :func:`to_numeric` with ``errors='coerce'`` and ``dtype_backend='pyarrow'`` with :class:`ArrowDtype` data (:issue:`52588`) - Bug in :meth:`ArrowDtype.__from_arrow__` not respecting if dtype is explicitly given (:issue:`52533`) -- Bug in :meth:`DataFrame.convert_dtypes` ignores ``convert_*`` keywords when set to False ``dtype_backend="pyarrow"`` (:issue:`52872`) - Bug in :meth:`DataFrame.max` and related casting different :class:`Timestamp` resolutions always to nanoseconds (:issue:`52524`) - Bug in :meth:`Series.describe` not returning :class:`ArrowDtype` with ``pyarrow.float64`` type with numeric data (:issue:`52427`) - Bug in :meth:`Series.dt.tz_localize` incorrectly localizing timestamps with :class:`ArrowDtype` (:issue:`52677`) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 77e57c64be72b..001fec77ecdac 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1110,10 +1110,10 @@ def convert_dtypes( (convert_integer and inferred_dtype.kind in "iu") or (convert_floating and inferred_dtype.kind in "fc") or (convert_boolean and inferred_dtype.kind == "b") - or (convert_string and isinstance(inferred_dtype, StringDtype) + or (convert_string and isinstance(inferred_dtype, StringDtype)) or ( inferred_dtype.kind not in "iufcb" - and not is_string_dtype(inferred_dtype) + and not isinstance(inferred_dtype, StringDtype) ) ): if isinstance(inferred_dtype, PandasExtensionDtype): From 67484e8b7bc8f598a264927cb3c399c0a0b259bd Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 24 Apr 2023 20:49:58 +0200 Subject: [PATCH 09/10] Move --- doc/source/whatsnew/v2.0.2.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.0.2.rst b/doc/source/whatsnew/v2.0.2.rst index 0a6738cb9b3dc..c03b250c8ca20 100644 --- a/doc/source/whatsnew/v2.0.2.rst +++ b/doc/source/whatsnew/v2.0.2.rst @@ -20,6 +20,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ +- Bug in :meth:`DataFrame.convert_dtypes` ignores ``convert_*`` keywords when set to False ``dtype_backend="pyarrow"`` (:issue:`52872`) - .. --------------------------------------------------------------------------- From 4f3ce3b57833ce503e3bef77f7bd4a495d6abcb0 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 25 Apr 2023 21:58:14 +0200 Subject: [PATCH 10/10] Fix mypy --- pandas/core/dtypes/cast.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 001fec77ecdac..fd8c651fe73dc 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1106,6 +1106,8 @@ def convert_dtypes( from pandas.core.arrays.arrow.dtype import ArrowDtype from pandas.core.arrays.string_ import StringDtype + assert not isinstance(inferred_dtype, str) + if ( (convert_integer and inferred_dtype.kind in "iu") or (convert_floating and inferred_dtype.kind in "fc") @@ -1123,10 +1125,7 @@ def convert_dtypes( elif isinstance(inferred_dtype, StringDtype): base_dtype = np.dtype(str) else: - # error: Incompatible types in assignment (expression has type - # "Union[str, Any, dtype[Any], ExtensionDtype]", - # variable has type "Union[dtype[Any], ExtensionDtype, None]") - base_dtype = inferred_dtype # type: ignore[assignment] + base_dtype = inferred_dtype pa_type = to_pyarrow_type(base_dtype) if pa_type is not None: inferred_dtype = ArrowDtype(pa_type)