diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 43a34c8e18b2d..656f6f13b4e82 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -773,6 +773,7 @@ Other API changes - :func:`read_stata` with parameter ``index_col`` set to ``None`` (the default) will now set the index on the returned :class:`DataFrame` to a :class:`RangeIndex` instead of a :class:`Int64Index` (:issue:`49745`) - Changed behavior of :class:`Index`, :class:`Series`, and :class:`DataFrame` arithmetic methods when working with object-dtypes, the results no longer do type inference on the result of the array operations, use ``result.infer_objects(copy=False)`` to do type inference on the result (:issue:`49999`, :issue:`49714`) - Changed behavior of :class:`Index` constructor with an object-dtype ``numpy.ndarray`` containing all-``bool`` values or all-complex values, this will now retain object dtype, consistent with the :class:`Series` behavior (:issue:`49594`) +- Changed behavior of :meth:`Series.astype` from object-dtype containing ``bytes`` objects to string dtypes; this now does ``val.decode()`` on bytes objects instead of ``str(val)``, matching :meth:`Index.astype` behavior (:issue:`45326`) - Added ``"None"`` to default ``na_values`` in :func:`read_csv` (:issue:`50286`) - Changed behavior of :class:`Series` and :class:`DataFrame` constructors when given an integer dtype and floating-point data that is not round numbers, this now raises ``ValueError`` instead of silently retaining the float dtype; do ``Series(data)`` or ``DataFrame(data)`` to get the old behavior, and ``Series(data).astype(dtype)`` or ``DataFrame(data).astype(dtype)`` to get the specified dtype (:issue:`49599`) - Changed behavior of :meth:`DataFrame.shift` with ``axis=1``, an integer ``fill_value``, and homogeneous datetime-like dtype, this now fills new columns with integer dtypes instead of casting to datetimelike (:issue:`49842`) diff --git a/pandas/_libs/lib.pyx 
b/pandas/_libs/lib.pyx index d79f7068effc3..04b1266e4df17 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -777,7 +777,10 @@ cpdef ndarray[object] ensure_string_array( already_copied = True if not checknull(val): - if not util.is_float_object(val): + if isinstance(val, bytes): + # GH#49658 discussion of desired behavior here + result[i] = val.decode() + elif not util.is_float_object(val): # f"{val}" is faster than str(val) result[i] = f"{val}" else: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 363bfe76d40fb..75c0168f66126 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -995,12 +995,8 @@ def astype(self, dtype, copy: bool = True): new_values = cls._from_sequence(self, dtype=dtype, copy=copy) else: - if dtype == str: - # GH#38607 see test_astype_str_from_bytes - new_values = values.astype(dtype, copy=copy) - else: - # GH#13149 specifically use astype_array instead of astype - new_values = astype_array(values, dtype=dtype, copy=copy) + # GH#13149 specifically use astype_array instead of astype + new_values = astype_array(values, dtype=dtype, copy=copy) # pass copy=False because any copying will be done in the astype above return Index(new_values, name=self.name, dtype=new_values.dtype, copy=False) diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py index 6e1795b150b27..89ea1670d9e7b 100644 --- a/pandas/tests/extension/base/casting.py +++ b/pandas/tests/extension/base/casting.py @@ -55,9 +55,12 @@ def test_astype_str(self, data): ], ) def test_astype_string(self, data, nullable_string_dtype): - # GH-33465 + # GH-33465, GH#45326 as of 2.0 we decode bytes instead of calling str(obj) result = pd.Series(data[:5]).astype(nullable_string_dtype) - expected = pd.Series([str(x) for x in data[:5]], dtype=nullable_string_dtype) + expected = pd.Series( + [str(x) if not isinstance(x, bytes) else x.decode() for x in data[:5]], + dtype=nullable_string_dtype, + ) 
self.assert_series_equal(result, expected) def test_to_numpy(self, data): diff --git a/pandas/tests/indexes/object/test_astype.py b/pandas/tests/indexes/object/test_astype.py index 33e45a707df63..273b39b5e319d 100644 --- a/pandas/tests/indexes/object/test_astype.py +++ b/pandas/tests/indexes/object/test_astype.py @@ -3,17 +3,26 @@ from pandas import ( Index, NaT, + Series, ) import pandas._testing as tm def test_astype_str_from_bytes(): # https://github.com/pandas-dev/pandas/issues/38607 + # GH#49658 pre-2.0 Index called .values.astype(str) here, which effectively + # did a .decode() on the bytes object. In 2.0 we go through + # ensure_string_array which does val.decode() on bytes objects idx = Index(["あ", b"a"], dtype="object") result = idx.astype(str) expected = Index(["あ", "a"], dtype="object") tm.assert_index_equal(result, expected) + # while we're here, check that Series.astype behaves the same + result = Series(idx).astype(str) + expected = Series(expected) + tm.assert_series_equal(result, expected) + def test_astype_invalid_nas_to_tdt64_raises(): # GH#45722 don't cast np.datetime64 NaTs to timedelta64 NaT diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index ce17614e1f8b7..aae51ebc5a017 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -389,7 +389,13 @@ def test_astype_unicode(self): former_encoding = None if sys.getdefaultencoding() == "utf-8": - test_series.append(Series(["野菜食べないとやばい".encode()])) + # GH#45326 as of 2.0 Series.astype matches Index.astype by handling + # bytes with obj.decode() instead of str(obj) + item = "野菜食べないとやばい" + ser = Series([item.encode()]) + result = ser.astype("unicode") + expected = Series([item]) + tm.assert_series_equal(result, expected) for ser in test_series: res = ser.astype("unicode")