API: Series[bytes].astype(str) behavior (#49658)

jbrockmendel · web-flow · commit 019731296e8b · 2023-02-15T10:08:22.000-08:00
* API: Series[bytes].astype(str) behavior

* choose Series behavior

* use .decode
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
@@ -787,6 +787,7 @@ Other API changes
 - :func:`read_stata` with parameter ``index_col`` set to ``None`` (the default) will now set the index on the returned :class:`DataFrame` to a :class:`RangeIndex` instead of a :class:`Int64Index` (:issue:`49745`)
 - Changed behavior of :class:`Index`, :class:`Series`, and :class:`DataFrame` arithmetic methods when working with object-dtypes, the results no longer do type inference on the result of the array operations, use ``result.infer_objects(copy=False)`` to do type inference on the result (:issue:`49999`, :issue:`49714`)
 - Changed behavior of :class:`Index` constructor with an object-dtype ``numpy.ndarray`` containing all-``bool`` values or all-complex values, this will now retain object dtype, consistent with the :class:`Series` behavior (:issue:`49594`)
+- Changed behavior of :meth:`Series.astype` from object-dtype containing ``bytes`` objects to string dtypes; this now does ``val.decode()`` on bytes objects instead of ``str(val)``, matching :meth:`Index.astype` behavior (:issue:`45326`)
 - Added ``"None"`` to default ``na_values`` in :func:`read_csv` (:issue:`50286`)
 - Changed behavior of :class:`Series` and :class:`DataFrame` constructors when given an integer dtype and floating-point data that is not round numbers, this now raises ``ValueError`` instead of silently retaining the float dtype; do ``Series(data)`` or ``DataFrame(data)`` to get the old behavior, and ``Series(data).astype(dtype)`` or ``DataFrame(data).astype(dtype)`` to get the specified dtype (:issue:`49599`)
 - Changed behavior of :meth:`DataFrame.shift` with ``axis=1``, an integer ``fill_value``, and homogeneous datetime-like dtype, this now fills new columns with integer dtypes instead of casting to datetimelike (:issue:`49842`)
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -777,7 +777,10 @@ cpdef ndarray[object] ensure_string_array(
             already_copied = True
 
         if not checknull(val):
-            if not util.is_float_object(val):
+            if isinstance(val, bytes):
+                # GH#49658 discussion of desired behavior here
+                result[i] = val.decode()
+            elif not util.is_float_object(val):
                 # f"{val}" is faster than str(val)
                 result[i] = f"{val}"
             else:
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -990,12 +990,8 @@ def astype(self, dtype, copy: bool = True):
             new_values = cls._from_sequence(self, dtype=dtype, copy=copy)
 
         else:
-            if dtype == str:
-                # GH#38607 see test_astype_str_from_bytes
-                new_values = values.astype(dtype, copy=copy)
-            else:
-                # GH#13149 specifically use astype_array instead of astype
-                new_values = astype_array(values, dtype=dtype, copy=copy)
+            # GH#13149 specifically use astype_array instead of astype
+            new_values = astype_array(values, dtype=dtype, copy=copy)
 
         # pass copy=False because any copying will be done in the astype above
         return Index(new_values, name=self.name, dtype=new_values.dtype, copy=False)
diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py
@@ -55,9 +55,12 @@ def test_astype_str(self, data):
         ],
     )
     def test_astype_string(self, data, nullable_string_dtype):
-        # GH-33465
+        # GH-33465, GH#45326 as of 2.0 we decode bytes instead of calling str(obj)
         result = pd.Series(data[:5]).astype(nullable_string_dtype)
-        expected = pd.Series([str(x) for x in data[:5]], dtype=nullable_string_dtype)
+        expected = pd.Series(
+            [str(x) if not isinstance(x, bytes) else x.decode() for x in data[:5]],
+            dtype=nullable_string_dtype,
+        )
         self.assert_series_equal(result, expected)
 
     def test_to_numpy(self, data):
diff --git a/pandas/tests/indexes/object/test_astype.py b/pandas/tests/indexes/object/test_astype.py
@@ -3,17 +3,26 @@
 from pandas import (
     Index,
     NaT,
+    Series,
 )
 import pandas._testing as tm
 
 
 def test_astype_str_from_bytes():
     # https://github.com/pandas-dev/pandas/issues/38607
+    # GH#49658 pre-2.0 Index called .values.astype(str) here, which effectively
+    #  did a .decode() on the bytes object.  In 2.0 we go through
+    #  ensure_string_array which does val.decode() for bytes objects
     idx = Index(["あ", b"a"], dtype="object")
     result = idx.astype(str)
     expected = Index(["あ", "a"], dtype="object")
     tm.assert_index_equal(result, expected)
 
+    # while we're here, check that Series.astype behaves the same
+    result = Series(idx).astype(str)
+    expected = Series(expected)
+    tm.assert_series_equal(result, expected)
+
 
 def test_astype_invalid_nas_to_tdt64_raises():
     # GH#45722 don't cast np.datetime64 NaTs to timedelta64 NaT
diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py
@@ -389,7 +389,13 @@ def test_astype_unicode(self):
         former_encoding = None
 
         if sys.getdefaultencoding() == "utf-8":
-            test_series.append(Series(["野菜食べないとやばい".encode()]))
+            # GH#45326 as of 2.0 Series.astype matches Index.astype by handling
+            #  bytes with obj.decode() instead of str(obj)
+            item = "野菜食べないとやばい"
+            ser = Series([item.encode()])
+            result = ser.astype("unicode")
+            expected = Series([item])
+            tm.assert_series_equal(result, expected)
 
         for ser in test_series:
             res = ser.astype("unicode")