From e19455d2cb2ce250ad2b69c99f9c0217d9669c9f Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 16 Feb 2025 07:54:01 -0500 Subject: [PATCH 1/7] ENH: Add dtype argument to str.decode --- doc/source/whatsnew/v2.3.0.rst | 1 + pandas/core/strings/accessor.py | 16 ++++++++++++++-- pandas/tests/strings/test_strings.py | 21 +++++++++++++++++++++ 3 files changed, 36 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 8bdddb5b7f85d..ea00d5df7f291 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -39,6 +39,7 @@ Other enhancements - :meth:`~Series.to_hdf` and :meth:`~DataFrame.to_hdf` now round-trip with ``StringDtype`` (:issue:`60663`) - The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns when backed by PyArrow (:issue:`60633`) - The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`) +- The :meth:`Series.str.decode` has gained the argument ``dtype`` to control the dtype of the result (:issue:`???`) .. --------------------------------------------------------------------------- .. _whatsnew_230.notable_bug_fixes: diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index b854338c2d1d7..0840bd61fe8d0 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -34,6 +34,7 @@ is_numeric_dtype, is_object_dtype, is_re, + is_string_dtype, ) from pandas.core.dtypes.dtypes import ( ArrowDtype, @@ -2102,7 +2103,7 @@ def slice_replace(self, start=None, stop=None, repl=None): result = self._data.array._str_slice_replace(start, stop, repl) return self._wrap_result(result) - def decode(self, encoding, errors: str = "strict"): + def decode(self, encoding, errors: str = "strict", dtype: str | DtypeObj = None): """ Decode character string in the Series/Index using indicated encoding. @@ -2116,6 +2117,10 @@ def decode(self, encoding, errors: str = "strict"): errors : str, optional Specifies the error handling scheme. Possible values are those supported by :meth:`bytes.decode`. + dtype : str or dtype, optional + The dtype of the result. When not ``None``, must be either a string or + object dtype. When ``None``, the dtype of the result is determined by + ``pd.options.future.infer_string``. Returns ------- @@ -2137,6 +2142,12 @@ def decode(self, encoding, errors: str = "strict"): 2 () dtype: object """ + if ( + dtype is not None + and not is_string_dtype(dtype) + and not is_object_dtype(dtype) + ): + raise ValueError(f"dtype must be string or object, got {dtype=}") # TODO: Add a similar _bytes interface. if encoding in _cpython_optimized_decoders: # CPython optimized implementation @@ -2146,7 +2157,8 @@ def decode(self, encoding, errors: str = "strict"): f = lambda x: decoder(x, errors)[0] arr = self._data.array result = arr._str_map(f) - dtype = "str" if get_option("future.infer_string") else None + if dtype is None: + dtype = "str" if get_option("future.infer_string") else None return self._wrap_result(result, dtype=dtype) @forbid_nonstring_types(["bytes"]) diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index ee531b32aa82d..58cccbdcc45ea 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -601,6 +601,27 @@ def test_decode_errors_kwarg(): tm.assert_series_equal(result, expected) +def test_decode_string_dtype(string_dtype): + ser = Series([b"a", b"b"]) + result = ser.str.decode("utf-8", dtype=string_dtype) + expected = Series(["a", "b"], dtype=string_dtype) + tm.assert_series_equal(result, expected) + + +def test_decode_object_dtype(object_dtype): + ser = Series([b"a", rb"\ud800"]) + result = ser.str.decode("utf-8", dtype=object_dtype) + expected = Series(["a", r"\ud800"], dtype=object_dtype) + tm.assert_series_equal(result, expected) + + +def test_decode_bad_dtype(): + ser = Series([b"a", b"b"]) + msg = "dtype must be string or object, got dtype='int64'" + with pytest.raises(ValueError, match=msg): + ser.str.decode("utf-8", dtype="int64") + + @pytest.mark.parametrize( "form, expected", [ From d37469fb38011380576618ecfff3be00b02f0f73 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 16 Feb 2025 07:56:30 -0500 Subject: [PATCH 2/7] Refinements --- doc/source/whatsnew/v2.3.0.rst | 2 +- pandas/core/strings/accessor.py | 2 ++ pandas/tests/strings/test_strings.py | 3 +++ 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index ea00d5df7f291..42beb9080b66f 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -37,9 +37,9 @@ Other enhancements updated to work correctly with NumPy >= 2 (:issue:`57739`) - :meth:`Series.str.decode` result now has ``StringDtype`` when ``future.infer_string`` is True (:issue:`60709`) - :meth:`~Series.to_hdf` and :meth:`~DataFrame.to_hdf` now round-trip with ``StringDtype`` (:issue:`60663`) +- The :meth:`Series.str.decode` has gained the argument ``dtype`` to control the dtype of the result (:issue:`60940`) - The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns when backed by PyArrow (:issue:`60633`) - The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`) -- The :meth:`Series.str.decode` has gained the argument ``dtype`` to control the dtype of the result (:issue:`???`) .. --------------------------------------------------------------------------- .. _whatsnew_230.notable_bug_fixes: diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 0840bd61fe8d0..94731d47032d4 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -2122,6 +2122,8 @@ def decode(self, encoding, errors: str = "strict", dtype: str | DtypeObj = None) object dtype. When ``None``, the dtype of the result is determined by ``pd.options.future.infer_string``. + .. versionadded:: 2.3.0 + Returns ------- Series or Index diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 58cccbdcc45ea..025f837982595 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -602,6 +602,7 @@ def test_decode_errors_kwarg(): def test_decode_string_dtype(string_dtype): + # https://github.com/pandas-dev/pandas/pull/60940 ser = Series([b"a", b"b"]) result = ser.str.decode("utf-8", dtype=string_dtype) expected = Series(["a", "b"], dtype=string_dtype) @@ -609,6 +610,7 @@ def test_decode_string_dtype(string_dtype): def test_decode_object_dtype(object_dtype): + # https://github.com/pandas-dev/pandas/pull/60940 ser = Series([b"a", rb"\ud800"]) result = ser.str.decode("utf-8", dtype=object_dtype) expected = Series(["a", r"\ud800"], dtype=object_dtype) @@ -616,6 +618,7 @@ def test_decode_object_dtype(object_dtype): def test_decode_bad_dtype(): + # https://github.com/pandas-dev/pandas/pull/60940 ser = Series([b"a", b"b"]) msg = "dtype must be string or object, got dtype='int64'" with pytest.raises(ValueError, match=msg): From 797f99c53f4c07f06460614e3e0197b8552d9031 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 16 Feb 2025 07:59:27 -0500 Subject: [PATCH 3/7] cleanup --- pandas/core/strings/accessor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 94731d47032d4..955638984aab9 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -2159,8 +2159,8 @@ def decode(self, encoding, errors: str = "strict", dtype: str | DtypeObj = None) f = lambda x: decoder(x, errors)[0] arr = self._data.array result = arr._str_map(f) - if dtype is None: - dtype = "str" if get_option("future.infer_string") else None + if dtype is None and get_option("future.infer_string"): + dtype = "str" return self._wrap_result(result, dtype=dtype) @forbid_nonstring_types(["bytes"]) From 6cd5f0207e3bba0e491c5c5780310d3a33c10de4 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 16 Feb 2025 07:59:55 -0500 Subject: [PATCH 4/7] cleanup --- pandas/core/strings/accessor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 955638984aab9..87ef62e1aeab4 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -2150,6 +2150,8 @@ def decode(self, encoding, errors: str = "strict", dtype: str | DtypeObj = None) and not is_object_dtype(dtype) ): raise ValueError(f"dtype must be string or object, got {dtype=}") + if dtype is None and get_option("future.infer_string"): + dtype = "str" # TODO: Add a similar _bytes interface. if encoding in _cpython_optimized_decoders: # CPython optimized implementation @@ -2159,8 +2161,6 @@ def decode(self, encoding, errors: str = "strict", dtype: str | DtypeObj = None) f = lambda x: decoder(x, errors)[0] arr = self._data.array result = arr._str_map(f) - if dtype is None and get_option("future.infer_string"): - dtype = "str" return self._wrap_result(result, dtype=dtype) @forbid_nonstring_types(["bytes"]) From 5a836bb6deb1777488b73f263001c627713b3cfa Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sun, 16 Feb 2025 08:23:00 -0500 Subject: [PATCH 5/7] type-hint fixup --- pandas/core/strings/accessor.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 87ef62e1aeab4..6fb7207f45ea8 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -2103,7 +2103,9 @@ def slice_replace(self, start=None, stop=None, repl=None): result = self._data.array._str_slice_replace(start, stop, repl) return self._wrap_result(result) - def decode(self, encoding, errors: str = "strict", dtype: str | DtypeObj = None): + def decode( + self, encoding, errors: str = "strict", dtype: str | DtypeObj | None = None + ): """ Decode character string in the Series/Index using indicated encoding. From ee2d377dba883564e2d30a611b5893b663b0a3d2 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Mon, 17 Feb 2025 16:18:09 -0500 Subject: [PATCH 6/7] Simplify condition --- pandas/core/strings/accessor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 6fb7207f45ea8..449a492557fa2 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -2149,7 +2149,6 @@ def decode( if ( dtype is not None and not is_string_dtype(dtype) - and not is_object_dtype(dtype) ): raise ValueError(f"dtype must be string or object, got {dtype=}") if dtype is None and get_option("future.infer_string"): From 91d6be373598e2c9dde1ed548ad7e35ffdbbec55 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 17 Feb 2025 17:32:24 -0500 Subject: [PATCH 7/7] lint --- pandas/core/strings/accessor.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 449a492557fa2..75fbd642c3520 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -2146,10 +2146,7 @@ def decode( 2 () dtype: object """ - if ( - dtype is not None - and not is_string_dtype(dtype) - ): + if dtype is not None and not is_string_dtype(dtype): raise ValueError(f"dtype must be string or object, got {dtype=}") if dtype is None and get_option("future.infer_string"): dtype = "str"