From f10b95daa5b950a42f6cf8995fd409958d983637 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Fri, 11 Nov 2022 14:39:26 -0800
Subject: [PATCH 1/3] API: Series[bytes].astype(str) behavior

---
 pandas/_libs/lib.pyx                       |  5 ++++-
 pandas/core/indexes/base.py                |  8 ++------
 pandas/tests/extension/test_arrow.py       | 24 ++++++++++++++++++++++
 pandas/tests/indexes/object/test_astype.py |  7 +++++++
 pandas/tests/series/methods/test_astype.py |  6 +++++-
 5 files changed, 42 insertions(+), 8 deletions(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 3769bbf087fee..778eb4e43fba7 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -715,7 +715,10 @@ cpdef ndarray[object] ensure_string_array(
             continue
 
         if not checknull(val):
-            if not util.is_float_object(val):
+            if isinstance(val, bytes):
+                # GH#?? see test_astype_str_from_bytes
+                result[i] = val.decode()
+            elif not util.is_float_object(val):
                 # f"{val}" is faster than str(val)
                 result[i] = f"{val}"
             else:
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 27672c82fdf15..0e693eca6f939 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -1031,12 +1031,8 @@ def astype(self, dtype, copy: bool = True):
             new_values = cls._from_sequence(self, dtype=dtype, copy=copy)
 
         else:
-            if dtype == str:
-                # GH#38607 see test_astype_str_from_bytes
-                new_values = values.astype(dtype, copy=copy)
-            else:
-                # GH#13149 specifically use astype_nansafe instead of astype
-                new_values = astype_nansafe(values, dtype=dtype, copy=copy)
+            # GH#13149 specifically use astype_nansafe instead of astype
+            new_values = astype_nansafe(values, dtype=dtype, copy=copy)
 
         # pass copy=False because any copying will be done in the astype above
         if self._is_backward_compat_public_numeric_index:
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index d094a7731c417..4942e2c408080 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -34,6 +34,7 @@
     pa_version_under9p0,
 )
 from pandas.errors import PerformanceWarning
+import pandas.util._test_decorators as td
 
 import pandas as pd
 import pandas._testing as tm
@@ -234,6 +235,29 @@ def test_astype_str(self, data, request):
             )
         super().test_astype_str(data)
 
+    @pytest.mark.parametrize(
+        "nullable_string_dtype",
+        [
+            "string[python]",
+            pytest.param(
+                "string[pyarrow]", marks=td.skip_if_no("pyarrow", min_version="1.0.0")
+            ),
+        ],
+    )
+    def test_astype_string(self, data, nullable_string_dtype):
+        # with binary dtype
+        pa_dtype = data.dtype.pyarrow_dtype
+        if pa.types.is_binary(pa_dtype):
+            # in this case we end up doing val.decode() instead of str(val)
+            #  so get e.g. "a" instead of "b'a'"
+            result = pd.Series(data[:5]).astype(nullable_string_dtype)
+            expected = pd.Series(
+                [x.decode() for x in data[:5]], dtype=nullable_string_dtype
+            )
+            self.assert_series_equal(result, expected)
+        else:
+            super().test_astype_string(data, nullable_string_dtype)
+
 
 class TestConstructors(base.BaseConstructorsTests):
     def test_from_dtype(self, data, request):
diff --git a/pandas/tests/indexes/object/test_astype.py b/pandas/tests/indexes/object/test_astype.py
index 33e45a707df63..de72c84645b4b 100644
--- a/pandas/tests/indexes/object/test_astype.py
+++ b/pandas/tests/indexes/object/test_astype.py
@@ -3,6 +3,7 @@
 from pandas import (
     Index,
     NaT,
+    Series,
 )
 import pandas._testing as tm
 
@@ -14,6 +15,12 @@ def test_astype_str_from_bytes():
     expected = Index(["あ", "a"], dtype="object")
     tm.assert_index_equal(result, expected)
 
+    # while we're here, check that Series.astype behaves the same
+
+    result = Series(idx).astype(str)
+    expected = Series(expected)
+    tm.assert_series_equal(result, expected)
+
 
 def test_astype_invalid_nas_to_tdt64_raises():
     # GH#45722 don't cast np.datetime64 NaTs to timedelta64 NaT
diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py
index 768cc50857e50..2535a83327fe6 100644
--- a/pandas/tests/series/methods/test_astype.py
+++ b/pandas/tests/series/methods/test_astype.py
@@ -378,7 +378,11 @@ def test_astype_unicode(self):
         former_encoding = None
 
         if sys.getdefaultencoding() == "utf-8":
-            test_series.append(Series(["野菜食べないとやばい".encode()]))
+            item = "野菜食べないとやばい"
+            ser = Series([item.encode()])
+            res = ser.astype("unicode")
+            expected = Series([item])
+            tm.assert_series_equal(res, expected)
 
         for ser in test_series:
             res = ser.astype("unicode")

From f95e7121892a51a2031ba9316ff5031ec81de8e7 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Thu, 17 Nov 2022 09:49:41 -0800
Subject: [PATCH 2/3] choose Series behavior

---
 doc/source/whatsnew/v2.0.0.rst             |  1 +
 pandas/_libs/lib.pyx                       |  5 +----
 pandas/tests/extension/test_arrow.py       | 24 ----------------------
 pandas/tests/indexes/object/test_astype.py |  6 ++++--
 pandas/tests/series/methods/test_astype.py |  6 +-----
 5 files changed, 7 insertions(+), 35 deletions(-)

diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index 4577d20a509ce..268313660aea3 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -343,6 +343,7 @@ Other API changes
 - Passing a sequence containing a type that cannot be converted to :class:`Timedelta` to :func:`to_timedelta` or to the :class:`Series` or :class:`DataFrame` constructor with ``dtype="timedelta64[ns]"`` or to :class:`TimedeltaIndex` now raises ``TypeError`` instead of ``ValueError`` (:issue:`49525`)
 - Changed behavior of :class:`Index` constructor with sequence containing at least one ``NaT`` and everything else either ``None`` or ``NaN`` to infer ``datetime64[ns]`` dtype instead of ``object``, matching :class:`Series` behavior (:issue:`49340`)
 - Changed behavior of :class:`Index` constructor with an object-dtype ``numpy.ndarray`` containing all-``bool`` values or all-complex values, this will now retain object dtype, consistent with the :class:`Series` behavior (:issue:`49594`)
+- Changed behavior of :meth:`Index.astype` from object-dtype containing ``bytes`` objects to string dtypes; this now does ``str(val)"`` on bytes objects instead of ``val.decode()``, matching :meth:`Series.astype` behavior (:issue:`45326`)
 -
 
 .. ---------------------------------------------------------------------------
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 778eb4e43fba7..3769bbf087fee 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -715,10 +715,7 @@ cpdef ndarray[object] ensure_string_array(
             continue
 
         if not checknull(val):
-            if isinstance(val, bytes):
-                # GH#?? see test_astype_str_from_bytes
-                result[i] = val.decode()
-            elif not util.is_float_object(val):
+            if not util.is_float_object(val):
                 # f"{val}" is faster than str(val)
                 result[i] = f"{val}"
             else:
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index 4942e2c408080..d094a7731c417 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -34,7 +34,6 @@
     pa_version_under9p0,
 )
 from pandas.errors import PerformanceWarning
-import pandas.util._test_decorators as td
 
 import pandas as pd
 import pandas._testing as tm
@@ -235,29 +234,6 @@ def test_astype_str(self, data, request):
             )
         super().test_astype_str(data)
 
-    @pytest.mark.parametrize(
-        "nullable_string_dtype",
-        [
-            "string[python]",
-            pytest.param(
-                "string[pyarrow]", marks=td.skip_if_no("pyarrow", min_version="1.0.0")
-            ),
-        ],
-    )
-    def test_astype_string(self, data, nullable_string_dtype):
-        # with binary dtype
-        pa_dtype = data.dtype.pyarrow_dtype
-        if pa.types.is_binary(pa_dtype):
-            # in this case we end up doing val.decode() instead of str(val)
-            #  so get e.g. "a" instead of "b'a'"
-            result = pd.Series(data[:5]).astype(nullable_string_dtype)
-            expected = pd.Series(
-                [x.decode() for x in data[:5]], dtype=nullable_string_dtype
-            )
-            self.assert_series_equal(result, expected)
-        else:
-            super().test_astype_string(data, nullable_string_dtype)
-
 
 class TestConstructors(base.BaseConstructorsTests):
     def test_from_dtype(self, data, request):
diff --git a/pandas/tests/indexes/object/test_astype.py b/pandas/tests/indexes/object/test_astype.py
index de72c84645b4b..5e9b94f42c22c 100644
--- a/pandas/tests/indexes/object/test_astype.py
+++ b/pandas/tests/indexes/object/test_astype.py
@@ -10,13 +10,15 @@
 
 def test_astype_str_from_bytes():
     # https://github.com/pandas-dev/pandas/issues/38607
+    # GH#49658 pre-2.0 Index called .values.astype(str) here, which effectively
+    #  did a .decode() on the bytes object.  In 2.0 we go through
+    #  ensure_string_array which does f"{val}"
     idx = Index(["あ", b"a"], dtype="object")
     result = idx.astype(str)
-    expected = Index(["あ", "a"], dtype="object")
+    expected = Index(["あ", "b'a'"], dtype="object")
     tm.assert_index_equal(result, expected)
 
     # while we're here, check that Series.astype behaves the same
-
     result = Series(idx).astype(str)
     expected = Series(expected)
     tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py
index 2535a83327fe6..768cc50857e50 100644
--- a/pandas/tests/series/methods/test_astype.py
+++ b/pandas/tests/series/methods/test_astype.py
@@ -378,11 +378,7 @@ def test_astype_unicode(self):
         former_encoding = None
 
         if sys.getdefaultencoding() == "utf-8":
-            item = "野菜食べないとやばい"
-            ser = Series([item.encode()])
-            res = ser.astype("unicode")
-            expected = Series([item])
-            tm.assert_series_equal(res, expected)
+            test_series.append(Series(["野菜食べないとやばい".encode()]))
 
         for ser in test_series:
             res = ser.astype("unicode")

From e5f68602caddbace5364212dcc7cc51f7fbc9552 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Fri, 10 Feb 2023 17:56:27 -0800
Subject: [PATCH 3/3] use .decode

---
 doc/source/whatsnew/v2.0.0.rst             | 2 +-
 pandas/_libs/lib.pyx                       | 5 ++++-
 pandas/tests/extension/base/casting.py     | 7 +++++--
 pandas/tests/indexes/object/test_astype.py | 2 +-
 pandas/tests/series/methods/test_astype.py | 8 +++++++-
 5 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index e92f9b2f7e366..656f6f13b4e82 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -773,7 +773,7 @@ Other API changes
 - :func:`read_stata` with parameter ``index_col`` set to ``None`` (the default) will now set the index on the returned :class:`DataFrame` to a :class:`RangeIndex` instead of a :class:`Int64Index` (:issue:`49745`)
 - Changed behavior of :class:`Index`, :class:`Series`, and :class:`DataFrame` arithmetic methods when working with object-dtypes, the results no longer do type inference on the result of the array operations, use ``result.infer_objects(copy=False)`` to do type inference on the result (:issue:`49999`, :issue:`49714`)
 - Changed behavior of :class:`Index` constructor with an object-dtype ``numpy.ndarray`` containing all-``bool`` values or all-complex values, this will now retain object dtype, consistent with the :class:`Series` behavior (:issue:`49594`)
-- Changed behavior of :meth:`Index.astype` from object-dtype containing ``bytes`` objects to string dtypes; this now does ``str(val)"`` on bytes objects instead of ``val.decode()``, matching :meth:`Series.astype` behavior (:issue:`45326`)
+- Changed behavior of :meth:`Series.astype` from object-dtype containing ``bytes`` objects to string dtypes; this now does ``val.decode()`` on bytes objects instead of ``str(val)``, matching :meth:`Index.astype` behavior (:issue:`45326`)
 - Added ``"None"`` to default ``na_values`` in :func:`read_csv` (:issue:`50286`)
 - Changed behavior of :class:`Series` and :class:`DataFrame` constructors when given an integer dtype and floating-point data that is not round numbers, this now raises ``ValueError`` instead of silently retaining the float dtype; do ``Series(data)`` or ``DataFrame(data)`` to get the old behavior, and ``Series(data).astype(dtype)`` or ``DataFrame(data).astype(dtype)`` to get the specified dtype (:issue:`49599`)
 - Changed behavior of :meth:`DataFrame.shift` with ``axis=1``, an integer ``fill_value``, and homogeneous datetime-like dtype, this now fills new columns with integer dtypes instead of casting to datetimelike (:issue:`49842`)
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index d79f7068effc3..04b1266e4df17 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -777,7 +777,10 @@ cpdef ndarray[object] ensure_string_array(
             already_copied = True
 
         if not checknull(val):
-            if not util.is_float_object(val):
+            if isinstance(val, bytes):
+                # GH#49658 discussion of desired behavior here
+                result[i] = val.decode()
+            elif not util.is_float_object(val):
                 # f"{val}" is faster than str(val)
                 result[i] = f"{val}"
             else:
diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py
index 6e1795b150b27..89ea1670d9e7b 100644
--- a/pandas/tests/extension/base/casting.py
+++ b/pandas/tests/extension/base/casting.py
@@ -55,9 +55,12 @@ def test_astype_str(self, data):
         ],
     )
     def test_astype_string(self, data, nullable_string_dtype):
-        # GH-33465
+        # GH-33465, GH#45326 as of 2.0 we decode bytes instead of calling str(obj)
         result = pd.Series(data[:5]).astype(nullable_string_dtype)
-        expected = pd.Series([str(x) for x in data[:5]], dtype=nullable_string_dtype)
+        expected = pd.Series(
+            [str(x) if not isinstance(x, bytes) else x.decode() for x in data[:5]],
+            dtype=nullable_string_dtype,
+        )
         self.assert_series_equal(result, expected)
 
     def test_to_numpy(self, data):
diff --git a/pandas/tests/indexes/object/test_astype.py b/pandas/tests/indexes/object/test_astype.py
index 5e9b94f42c22c..273b39b5e319d 100644
--- a/pandas/tests/indexes/object/test_astype.py
+++ b/pandas/tests/indexes/object/test_astype.py
@@ -15,7 +15,7 @@ def test_astype_str_from_bytes():
     #  ensure_string_array which does f"{val}"
     idx = Index(["あ", b"a"], dtype="object")
     result = idx.astype(str)
-    expected = Index(["あ", "b'a'"], dtype="object")
+    expected = Index(["あ", "a"], dtype="object")
     tm.assert_index_equal(result, expected)
 
     # while we're here, check that Series.astype behaves the same
diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py
index ce17614e1f8b7..aae51ebc5a017 100644
--- a/pandas/tests/series/methods/test_astype.py
+++ b/pandas/tests/series/methods/test_astype.py
@@ -389,7 +389,13 @@ def test_astype_unicode(self):
         former_encoding = None
 
         if sys.getdefaultencoding() == "utf-8":
-            test_series.append(Series(["野菜食べないとやばい".encode()]))
+            # GH#45326 as of 2.0 Series.astype matches Index.astype by handling
+            #  bytes with obj.decode() instead of str(obj)
+            item = "野菜食べないとやばい"
+            ser = Series([item.encode()])
+            result = ser.astype("unicode")
+            expected = Series([item])
+            tm.assert_series_equal(result, expected)
 
         for ser in test_series:
             res = ser.astype("unicode")