From 211c8985ea468ec2d313a0809736ed04e74c97d5 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 24 Nov 2023 16:48:38 +0100 Subject: [PATCH 1/3] BUG: translate losing object dtype with new string dtype --- doc/source/whatsnew/v2.1.4.rst | 2 +- pandas/core/strings/accessor.py | 19 ++++++++++--------- pandas/tests/strings/test_find_replace.py | 6 +++++- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst index 543a9864ced26..77ce303dc1bfe 100644 --- a/doc/source/whatsnew/v2.1.4.rst +++ b/doc/source/whatsnew/v2.1.4.rst @@ -25,7 +25,7 @@ Bug fixes - Bug in :meth:`Index.__getitem__` returning wrong result for Arrow dtypes and negative stepsize (:issue:`55832`) - Fixed bug in :meth:`DataFrame.__setitem__` casting :class:`Index` with object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) - Fixed bug in :meth:`Index.insert` casting object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) -- +- Fixed bug in :meth:`Series.str.translate` losing object dtype when string option is set (:issue:`56152`) .. --------------------------------------------------------------------------- .. _whatsnew_214.other: diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 58b904fd31b6a..7deedc1c4cbe2 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -259,6 +259,7 @@ def _wrap_result( fill_value=np.nan, returns_string: bool = True, returns_bool: bool = False, + dtype=None, ): from pandas import ( Index, @@ -379,29 +380,29 @@ def cons_row(x): out = out.get_level_values(0) return out else: - return Index(result, name=name) + return Index(result, name=name, dtype=dtype) else: index = self._orig.index # This is a mess. - dtype: DtypeObj | str | None + _dtype: DtypeObj | str | None = dtype vdtype = getattr(result, "dtype", None) if self._is_string: if is_bool_dtype(vdtype): - dtype = result.dtype + _dtype = result.dtype elif returns_string: - dtype = self._orig.dtype + _dtype = self._orig.dtype else: - dtype = vdtype - else: + _dtype = vdtype + elif vdtype is not None: dtype = vdtype if expand: cons = self._orig._constructor_expanddim - result = cons(result, columns=name, index=index, dtype=dtype) + result = cons(result, columns=name, index=index, dtype=_dtype) else: # Must be a Series cons = self._orig._constructor - result = cons(result, name=name, index=index, dtype=dtype) + result = cons(result, name=name, index=index, dtype=_dtype) result = result.__finalize__(self._orig, method="str") if name is not None and result.ndim == 1: # __finalize__ might copy over the original name, but we may @@ -2317,7 +2318,7 @@ def translate(self, table): dtype: object """ result = self._data.array._str_translate(table) - return self._wrap_result(result) + return self._wrap_result(result, dtype=self._data.dtype) @forbid_nonstring_types(["bytes"]) def count(self, pat, flags: int = 0): diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 78f0730d730e8..bd64a5dce3b9a 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -5,6 +5,7 @@ import pytest from pandas.errors import PerformanceWarning +import pandas.util._test_decorators as td import pandas as pd from pandas import ( @@ -893,7 +894,10 @@ def test_find_nan(any_string_dtype): # -------------------------------------------------------------------------------------- -def test_translate(index_or_series, any_string_dtype): +@pytest.mark.parametrize( + "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))] +) +def test_translate(index_or_series, any_string_dtype, infer_string): obj = index_or_series( ["abcdefg", "abcc", "cdddfg", "cdefggg"], dtype=any_string_dtype ) From b500af0fa92a6373f8ed4d592ee4891eb09a2de7 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 24 Nov 2023 16:51:40 +0100 Subject: [PATCH 2/3] Fix --- pandas/core/strings/accessor.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 7deedc1c4cbe2..c563f2f366da3 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -2318,7 +2318,8 @@ def translate(self, table): dtype: object """ result = self._data.array._str_translate(table) - return self._wrap_result(result, dtype=self._data.dtype) + dtype = object if self._data.dtype == "object" else None + return self._wrap_result(result, dtype=dtype) @forbid_nonstring_types(["bytes"]) def count(self, pat, flags: int = 0): From a0f6da2137809591b8e976e9b7fd0dad4e113d19 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 24 Nov 2023 16:54:44 +0100 Subject: [PATCH 3/3] Update accessor.py --- pandas/core/strings/accessor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index c563f2f366da3..9fa6e9973291d 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -394,7 +394,7 @@ def cons_row(x): else: _dtype = vdtype elif vdtype is not None: - dtype = vdtype + _dtype = vdtype if expand: cons = self._orig._constructor_expanddim