Skip to content

Commit 0197312

Browse files
authored
API: Series[bytes].astype(str) behavior (#49658)
* API: Series[bytes].astype(str) behavior * choose Series behavior * use .decode
1 parent 6e27efc commit 0197312

File tree

6 files changed

+28
-10
lines changed

6 files changed

+28
-10
lines changed

doc/source/whatsnew/v2.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -787,6 +787,7 @@ Other API changes
787787
- :func:`read_stata` with parameter ``index_col`` set to ``None`` (the default) will now set the index on the returned :class:`DataFrame` to a :class:`RangeIndex` instead of a :class:`Int64Index` (:issue:`49745`)
788788
- Changed behavior of :class:`Index`, :class:`Series`, and :class:`DataFrame` arithmetic methods when working with object-dtypes, the results no longer do type inference on the result of the array operations, use ``result.infer_objects(copy=False)`` to do type inference on the result (:issue:`49999`, :issue:`49714`)
789789
- Changed behavior of :class:`Index` constructor with an object-dtype ``numpy.ndarray`` containing all-``bool`` values or all-complex values, this will now retain object dtype, consistent with the :class:`Series` behavior (:issue:`49594`)
790+
- Changed behavior of :meth:`Series.astype` from object-dtype containing ``bytes`` objects to string dtypes; this now does ``val.decode()`` on bytes objects instead of ``str(val)``, matching :meth:`Index.astype` behavior (:issue:`45326`)
790791
- Added ``"None"`` to default ``na_values`` in :func:`read_csv` (:issue:`50286`)
791792
- Changed behavior of :class:`Series` and :class:`DataFrame` constructors when given an integer dtype and floating-point data that is not round numbers, this now raises ``ValueError`` instead of silently retaining the float dtype; do ``Series(data)`` or ``DataFrame(data)`` to get the old behavior, and ``Series(data).astype(dtype)`` or ``DataFrame(data).astype(dtype)`` to get the specified dtype (:issue:`49599`)
792793
- Changed behavior of :meth:`DataFrame.shift` with ``axis=1``, an integer ``fill_value``, and homogeneous datetime-like dtype, this now fills new columns with integer dtypes instead of casting to datetimelike (:issue:`49842`)

pandas/_libs/lib.pyx

+4-1
Original file line numberDiff line numberDiff line change
@@ -777,7 +777,10 @@ cpdef ndarray[object] ensure_string_array(
777777
already_copied = True
778778

779779
if not checknull(val):
780-
if not util.is_float_object(val):
780+
if isinstance(val, bytes):
781+
# GH#49658 discussion of desired behavior here
782+
result[i] = val.decode()
783+
elif not util.is_float_object(val):
781784
# f"{val}" is faster than str(val)
782785
result[i] = f"{val}"
783786
else:

pandas/core/indexes/base.py

+2-6
Original file line numberDiff line numberDiff line change
@@ -990,12 +990,8 @@ def astype(self, dtype, copy: bool = True):
990990
new_values = cls._from_sequence(self, dtype=dtype, copy=copy)
991991

992992
else:
993-
if dtype == str:
994-
# GH#38607 see test_astype_str_from_bytes
995-
new_values = values.astype(dtype, copy=copy)
996-
else:
997-
# GH#13149 specifically use astype_array instead of astype
998-
new_values = astype_array(values, dtype=dtype, copy=copy)
993+
# GH#13149 specifically use astype_array instead of astype
994+
new_values = astype_array(values, dtype=dtype, copy=copy)
999995

1000996
# pass copy=False because any copying will be done in the astype above
1001997
return Index(new_values, name=self.name, dtype=new_values.dtype, copy=False)

pandas/tests/extension/base/casting.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -55,9 +55,12 @@ def test_astype_str(self, data):
5555
],
5656
)
5757
def test_astype_string(self, data, nullable_string_dtype):
58-
# GH-33465
58+
# GH-33465, GH#45326 as of 2.0 we decode bytes instead of calling str(obj)
5959
result = pd.Series(data[:5]).astype(nullable_string_dtype)
60-
expected = pd.Series([str(x) for x in data[:5]], dtype=nullable_string_dtype)
60+
expected = pd.Series(
61+
[str(x) if not isinstance(x, bytes) else x.decode() for x in data[:5]],
62+
dtype=nullable_string_dtype,
63+
)
6164
self.assert_series_equal(result, expected)
6265

6366
def test_to_numpy(self, data):

pandas/tests/indexes/object/test_astype.py

+9
Original file line numberDiff line numberDiff line change
@@ -3,17 +3,26 @@
33
from pandas import (
44
Index,
55
NaT,
6+
Series,
67
)
78
import pandas._testing as tm
89

910

1011
def test_astype_str_from_bytes():
1112
# https://github.com/pandas-dev/pandas/issues/38607
13+
# GH#49658 pre-2.0 Index called .values.astype(str) here, which effectively
14+
# did a .decode() on the bytes object. In 2.0 we go through
15+
# ensure_string_array which does val.decode() on bytes objects
1216
idx = Index(["あ", b"a"], dtype="object")
1317
result = idx.astype(str)
1418
expected = Index(["あ", "a"], dtype="object")
1519
tm.assert_index_equal(result, expected)
1620

21+
# while we're here, check that Series.astype behaves the same
22+
result = Series(idx).astype(str)
23+
expected = Series(expected)
24+
tm.assert_series_equal(result, expected)
25+
1726

1827
def test_astype_invalid_nas_to_tdt64_raises():
1928
# GH#45722 don't cast np.datetime64 NaTs to timedelta64 NaT

pandas/tests/series/methods/test_astype.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -389,7 +389,13 @@ def test_astype_unicode(self):
389389
former_encoding = None
390390

391391
if sys.getdefaultencoding() == "utf-8":
392-
test_series.append(Series(["野菜食べないとやばい".encode()]))
392+
# GH#45326 as of 2.0 Series.astype matches Index.astype by handling
393+
# bytes with obj.decode() instead of str(obj)
394+
item = "野菜食べないとやばい"
395+
ser = Series([item.encode()])
396+
result = ser.astype("unicode")
397+
expected = Series([item])
398+
tm.assert_series_equal(result, expected)
393399

394400
for ser in test_series:
395401
res = ser.astype("unicode")

0 commit comments

Comments
 (0)