From 75db35b257e5d164d64df13d511205e8be3132e8 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 27 Aug 2024 08:21:16 -0700 Subject: [PATCH 1/5] API: str.center with pyarrow-backed string dtype --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/arrays/_arrow_string_mixins.py | 12 ++++++++++-- pandas/core/arrays/string_arrow.py | 2 +- pandas/tests/strings/test_case_justify.py | 6 +----- 4 files changed, 13 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index da0d85b7bb529..3b20f55b62b90 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -277,6 +277,7 @@ Other API changes - Made ``dtype`` a required argument in :meth:`ExtensionArray._from_sequence_of_strings` (:issue:`56519`) - Passing a :class:`Series` input to :func:`json_normalize` will now retain the :class:`Series` :class:`Index`, previously output had a new :class:`RangeIndex` (:issue:`51452`) - Removed :meth:`Index.sort` which always raised a ``TypeError``. This attribute is not defined and will raise an ``AttributeError`` (:issue:`59283`) +- The ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype now matches the python behavior in corner cases with an odd number of fill characters when using pyarrow versions 17.0 and above (:issue:`54792`) - Updated :meth:`DataFrame.to_excel` so that the output spreadsheet has no styling. Custom styling can still be done using :meth:`Styler.to_excel` (:issue:`54154`) - pickle and HDF (``.h5``) files created with Python 2 are no longer explicitly supported (:issue:`57387`) - pickled objects from pandas version less than ``1.0.0`` are no longer supported (:issue:`57155`) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index c810af32f7480..8c6adb006f271 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -1,5 +1,6 @@ from __future__ import annotations +from functools import partial from typing import ( TYPE_CHECKING, Literal, @@ -7,7 +8,10 @@ import numpy as np -from pandas.compat import pa_version_under10p1 +from pandas.compat import ( + pa_version_under10p1, + pa_version_under17p0, +) from pandas.core.dtypes.missing import isna @@ -49,7 +53,11 @@ def _str_pad( elif side == "right": pa_pad = pc.utf8_rpad elif side == "both": - pa_pad = pc.utf8_center + if pa_version_under17p0: + pa_pad = pc.utf8_center + else: + # GH#54792 + pa_pad = partial(pc.utf8_center, lean_left_on_odd_padding=False) else: raise ValueError( f"Invalid side: {side}. Side must be one of 'left', 'right', 'both'" diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index e8e74b0ba1215..a3169985828e8 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -284,6 +284,7 @@ def astype(self, dtype, copy: bool = True): _str_map = BaseStringArray._str_map _str_startswith = ArrowStringArrayMixin._str_startswith _str_endswith = ArrowStringArrayMixin._str_endswith + _str_pad = ArrowStringArrayMixin._str_pad def _str_contains( self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True @@ -546,7 +547,6 @@ class ArrowStringArrayNumpySemantics(ArrowStringArray): _str_get = ArrowStringArrayMixin._str_get _str_removesuffix = ArrowStringArrayMixin._str_removesuffix _str_capitalize = ArrowStringArrayMixin._str_capitalize - _str_pad = ArrowStringArrayMixin._str_pad _str_title = ArrowStringArrayMixin._str_title _str_swapcase = ArrowStringArrayMixin._str_swapcase _str_slice_replace = ArrowStringArrayMixin._str_slice_replace diff --git a/pandas/tests/strings/test_case_justify.py b/pandas/tests/strings/test_case_justify.py index 41aedae90ca76..819556f961fa3 100644 --- a/pandas/tests/strings/test_case_justify.py +++ b/pandas/tests/strings/test_case_justify.py @@ -291,11 +291,7 @@ def test_center_ljust_rjust_mixed_object(): def test_center_ljust_rjust_fillchar(any_string_dtype): - if any_string_dtype == "string[pyarrow_numpy]": - pytest.skip( - "Arrow logic is different, " - "see https://github.com/pandas-dev/pandas/pull/54533/files#r1299808126", - ) + # GH#54533, GH#54792 s = Series(["a", "bb", "cccc", "ddddd", "eeeeee"], dtype=any_string_dtype) result = s.str.center(5, fillchar="X") From d9b18c9f1fd7737460b78a02f2848b03bbb42e0d Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 27 Aug 2024 09:37:15 -0700 Subject: [PATCH 2/5] object fallback --- pandas/core/arrays/_arrow_string_mixins.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 8c6adb006f271..99b191422ad95 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -54,7 +54,8 @@ def _str_pad( pa_pad = pc.utf8_rpad elif side == "both": if pa_version_under17p0: - pa_pad = pc.utf8_center + # GH#59624 fall back to object dtype + return super()._str_pad(width, side, fillchar) else: # GH#54792 pa_pad = partial(pc.utf8_center, lean_left_on_odd_padding=False) From c34b2e6a80554d495f13f2cf0a251f45dc7d21e1 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 27 Aug 2024 10:08:50 -0700 Subject: [PATCH 3/5] API: fall back to object dtype --- pandas/core/arrays/_arrow_string_mixins.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 99b191422ad95..03afa7e6aa81c 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -55,7 +55,12 @@ def _str_pad( elif side == "both": if pa_version_under17p0: # GH#59624 fall back to object dtype - return super()._str_pad(width, side, fillchar) + from pandas import array + + obj_arr = self.astype(object, copy=False) # type: ignore[attr-defined] + obj = array(obj_arr, dtype=object) + result = obj._str_pad(width, side, fillchar) # type: ignore[attr-defined] + return type(self)._from_sequence(result, dtype=self.dtype) # type: ignore[attr-defined] else: # GH#54792 pa_pad = partial(pc.utf8_center, lean_left_on_odd_padding=False) From 99dd31296acb29d84585b10c582729e865795eb0 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 29 Aug 2024 07:23:22 -0700 Subject: [PATCH 4/5] lean_left only on even --- pandas/core/arrays/_arrow_string_mixins.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 03afa7e6aa81c..b5cf0573e70ba 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -63,7 +63,9 @@ def _str_pad( return type(self)._from_sequence(result, dtype=self.dtype) # type: ignore[attr-defined] else: # GH#54792 - pa_pad = partial(pc.utf8_center, lean_left_on_odd_padding=False) + # https://github.com/apache/arrow/issues/15053#issuecomment-2317032347 + lean_left = (width % 2) == 0 + pa_pad = partial(pc.utf8_center, lean_left_on_odd_padding=lean_left) else: raise ValueError( f"Invalid side: {side}. Side must be one of 'left', 'right', 'both'" From 1f2902d318be5a6ee62bfd2c988ee95db4a67f67 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 31 Aug 2024 19:01:37 -0700 Subject: [PATCH 5/5] Move whatsnew --- doc/source/whatsnew/v2.3.0.rst | 3 ++- doc/source/whatsnew/v3.0.0.rst | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 8a64aa7c609d6..03355f655eb28 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -103,7 +103,8 @@ Conversion Strings ^^^^^^^ - Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`) -- +- Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`) + Interval ^^^^^^^^ diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 3b20f55b62b90..da0d85b7bb529 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -277,7 +277,6 @@ Other API changes - Made ``dtype`` a required argument in :meth:`ExtensionArray._from_sequence_of_strings` (:issue:`56519`) - Passing a :class:`Series` input to :func:`json_normalize` will now retain the :class:`Series` :class:`Index`, previously output had a new :class:`RangeIndex` (:issue:`51452`) - Removed :meth:`Index.sort` which always raised a ``TypeError``. This attribute is not defined and will raise an ``AttributeError`` (:issue:`59283`) -- The ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype now matches the python behavior in corner cases with an odd number of fill characters when using pyarrow versions 17.0 and above (:issue:`54792`) - Updated :meth:`DataFrame.to_excel` so that the output spreadsheet has no styling. Custom styling can still be done using :meth:`Styler.to_excel` (:issue:`54154`) - pickle and HDF (``.h5``) files created with Python 2 are no longer explicitly supported (:issue:`57387`) - pickled objects from pandas version less than ``1.0.0`` are no longer supported (:issue:`57155`)