From f10b95daa5b950a42f6cf8995fd409958d983637 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 11 Nov 2022 14:39:26 -0800 Subject: [PATCH 1/3] API: Series[bytes].astype(str) behavior --- pandas/_libs/lib.pyx | 5 ++++- pandas/core/indexes/base.py | 8 ++------ pandas/tests/extension/test_arrow.py | 24 ++++++++++++++++++++++ pandas/tests/indexes/object/test_astype.py | 7 +++++++ pandas/tests/series/methods/test_astype.py | 6 +++++- 5 files changed, 42 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 3769bbf087fee..778eb4e43fba7 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -715,7 +715,10 @@ cpdef ndarray[object] ensure_string_array( continue if not checknull(val): - if not util.is_float_object(val): + if isinstance(val, bytes): + # GH#?? see test_astype_str_from_bytes + result[i] = val.decode() + elif not util.is_float_object(val): # f"{val}" is faster than str(val) result[i] = f"{val}" else: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 27672c82fdf15..0e693eca6f939 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1031,12 +1031,8 @@ def astype(self, dtype, copy: bool = True): new_values = cls._from_sequence(self, dtype=dtype, copy=copy) else: - if dtype == str: - # GH#38607 see test_astype_str_from_bytes - new_values = values.astype(dtype, copy=copy) - else: - # GH#13149 specifically use astype_nansafe instead of astype - new_values = astype_nansafe(values, dtype=dtype, copy=copy) + # GH#13149 specifically use astype_nansafe instead of astype + new_values = astype_nansafe(values, dtype=dtype, copy=copy) # pass copy=False because any copying will be done in the astype above if self._is_backward_compat_public_numeric_index: diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index d094a7731c417..4942e2c408080 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -34,6 
+34,7 @@ pa_version_under9p0, ) from pandas.errors import PerformanceWarning +import pandas.util._test_decorators as td import pandas as pd import pandas._testing as tm @@ -234,6 +235,29 @@ def test_astype_str(self, data, request): ) super().test_astype_str(data) + @pytest.mark.parametrize( + "nullable_string_dtype", + [ + "string[python]", + pytest.param( + "string[pyarrow]", marks=td.skip_if_no("pyarrow", min_version="1.0.0") + ), + ], + ) + def test_astype_string(self, data, nullable_string_dtype): + # with binary dtype + pa_dtype = data.dtype.pyarrow_dtype + if pa.types.is_binary(pa_dtype): + # in this case we end up doing val.decode() instead of str(val) + # so get e.g. "a" instead of "b'a'" + result = pd.Series(data[:5]).astype(nullable_string_dtype) + expected = pd.Series( + [x.decode() for x in data[:5]], dtype=nullable_string_dtype + ) + self.assert_series_equal(result, expected) + else: + super().test_astype_string(data, nullable_string_dtype) + class TestConstructors(base.BaseConstructorsTests): def test_from_dtype(self, data, request): diff --git a/pandas/tests/indexes/object/test_astype.py b/pandas/tests/indexes/object/test_astype.py index 33e45a707df63..de72c84645b4b 100644 --- a/pandas/tests/indexes/object/test_astype.py +++ b/pandas/tests/indexes/object/test_astype.py @@ -3,6 +3,7 @@ from pandas import ( Index, NaT, + Series, ) import pandas._testing as tm @@ -14,6 +15,12 @@ def test_astype_str_from_bytes(): expected = Index(["あ", "a"], dtype="object") tm.assert_index_equal(result, expected) + # while we're here, check that Series.astype behaves the same + + result = Series(idx).astype(str) + expected = Series(expected) + tm.assert_series_equal(result, expected) + def test_astype_invalid_nas_to_tdt64_raises(): # GH#45722 don't cast np.datetime64 NaTs to timedelta64 NaT diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index 768cc50857e50..2535a83327fe6 100644 --- 
a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -378,7 +378,11 @@ def test_astype_unicode(self): former_encoding = None if sys.getdefaultencoding() == "utf-8": - test_series.append(Series(["野菜食べないとやばい".encode()])) + item = "野菜食べないとやばい" + ser = Series([item.encode()]) + res = ser.astype("unicode") + expected = Series([item]) + tm.assert_series_equal(res, expected) for ser in test_series: res = ser.astype("unicode") From f95e7121892a51a2031ba9316ff5031ec81de8e7 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 17 Nov 2022 09:49:41 -0800 Subject: [PATCH 2/3] choose Series behavior --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/_libs/lib.pyx | 5 +---- pandas/tests/extension/test_arrow.py | 24 ---------------------- pandas/tests/indexes/object/test_astype.py | 6 ++++-- pandas/tests/series/methods/test_astype.py | 6 +----- 5 files changed, 7 insertions(+), 35 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 4577d20a509ce..268313660aea3 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -343,6 +343,7 @@ Other API changes - Passing a sequence containing a type that cannot be converted to :class:`Timedelta` to :func:`to_timedelta` or to the :class:`Series` or :class:`DataFrame` constructor with ``dtype="timedelta64[ns]"`` or to :class:`TimedeltaIndex` now raises ``TypeError`` instead of ``ValueError`` (:issue:`49525`) - Changed behavior of :class:`Index` constructor with sequence containing at least one ``NaT`` and everything else either ``None`` or ``NaN`` to infer ``datetime64[ns]`` dtype instead of ``object``, matching :class:`Series` behavior (:issue:`49340`) - Changed behavior of :class:`Index` constructor with an object-dtype ``numpy.ndarray`` containing all-``bool`` values or all-complex values, this will now retain object dtype, consistent with the :class:`Series` behavior (:issue:`49594`) +- Changed behavior of :meth:`Index.astype` from 
object-dtype containing ``bytes`` objects to string dtypes; this now does ``str(val)"`` on bytes objects instead of ``val.decode()``, matching :meth:`Series.astype` behavior (:issue:`45326`) - .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 778eb4e43fba7..3769bbf087fee 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -715,10 +715,7 @@ cpdef ndarray[object] ensure_string_array( continue if not checknull(val): - if isinstance(val, bytes): - # GH#?? see test_astype_str_from_bytes - result[i] = val.decode() - elif not util.is_float_object(val): + if not util.is_float_object(val): # f"{val}" is faster than str(val) result[i] = f"{val}" else: diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 4942e2c408080..d094a7731c417 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -34,7 +34,6 @@ pa_version_under9p0, ) from pandas.errors import PerformanceWarning -import pandas.util._test_decorators as td import pandas as pd import pandas._testing as tm @@ -235,29 +234,6 @@ def test_astype_str(self, data, request): ) super().test_astype_str(data) - @pytest.mark.parametrize( - "nullable_string_dtype", - [ - "string[python]", - pytest.param( - "string[pyarrow]", marks=td.skip_if_no("pyarrow", min_version="1.0.0") - ), - ], - ) - def test_astype_string(self, data, nullable_string_dtype): - # with binary dtype - pa_dtype = data.dtype.pyarrow_dtype - if pa.types.is_binary(pa_dtype): - # in this case we end up doing val.decode() instead of str(val) - # so get e.g. 
"a" instead of "b'a'" - result = pd.Series(data[:5]).astype(nullable_string_dtype) - expected = pd.Series( - [x.decode() for x in data[:5]], dtype=nullable_string_dtype - ) - self.assert_series_equal(result, expected) - else: - super().test_astype_string(data, nullable_string_dtype) - class TestConstructors(base.BaseConstructorsTests): def test_from_dtype(self, data, request): diff --git a/pandas/tests/indexes/object/test_astype.py b/pandas/tests/indexes/object/test_astype.py index de72c84645b4b..5e9b94f42c22c 100644 --- a/pandas/tests/indexes/object/test_astype.py +++ b/pandas/tests/indexes/object/test_astype.py @@ -10,13 +10,15 @@ def test_astype_str_from_bytes(): # https://github.com/pandas-dev/pandas/issues/38607 + # GH#49658 pre-2.0 Index called .values.astype(str) here, which effectively + # did a .decode() on the bytes object. In 2.0 we go through + # ensure_string_array which does f"{val}" idx = Index(["あ", b"a"], dtype="object") result = idx.astype(str) - expected = Index(["あ", "a"], dtype="object") + expected = Index(["あ", "b'a'"], dtype="object") tm.assert_index_equal(result, expected) # while we're here, check that Series.astype behaves the same - result = Series(idx).astype(str) expected = Series(expected) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index 2535a83327fe6..768cc50857e50 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -378,11 +378,7 @@ def test_astype_unicode(self): former_encoding = None if sys.getdefaultencoding() == "utf-8": - item = "野菜食べないとやばい" - ser = Series([item.encode()]) - res = ser.astype("unicode") - expected = Series([item]) - tm.assert_series_equal(res, expected) + test_series.append(Series(["野菜食べないとやばい".encode()])) for ser in test_series: res = ser.astype("unicode") From e5f68602caddbace5364212dcc7cc51f7fbc9552 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 10 Feb 2023 
17:56:27 -0800 Subject: [PATCH 3/3] use .decode --- doc/source/whatsnew/v2.0.0.rst | 2 +- pandas/_libs/lib.pyx | 5 ++++- pandas/tests/extension/base/casting.py | 7 +++++-- pandas/tests/indexes/object/test_astype.py | 2 +- pandas/tests/series/methods/test_astype.py | 8 +++++++- 5 files changed, 18 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index e92f9b2f7e366..656f6f13b4e82 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -773,7 +773,7 @@ Other API changes - :func:`read_stata` with parameter ``index_col`` set to ``None`` (the default) will now set the index on the returned :class:`DataFrame` to a :class:`RangeIndex` instead of a :class:`Int64Index` (:issue:`49745`) - Changed behavior of :class:`Index`, :class:`Series`, and :class:`DataFrame` arithmetic methods when working with object-dtypes, the results no longer do type inference on the result of the array operations, use ``result.infer_objects(copy=False)`` to do type inference on the result (:issue:`49999`, :issue:`49714`) - Changed behavior of :class:`Index` constructor with an object-dtype ``numpy.ndarray`` containing all-``bool`` values or all-complex values, this will now retain object dtype, consistent with the :class:`Series` behavior (:issue:`49594`) -- Changed behavior of :meth:`Index.astype` from object-dtype containing ``bytes`` objects to string dtypes; this now does ``str(val)"`` on bytes objects instead of ``val.decode()``, matching :meth:`Series.astype` behavior (:issue:`45326`) +- Changed behavior of :meth:`Series.astype` from object-dtype containing ``bytes`` objects to string dtypes; this now does ``val.decode()`` on bytes objects instead of ``str(val)``, matching :meth:`Index.astype` behavior (:issue:`45326`) - Added ``"None"`` to default ``na_values`` in :func:`read_csv` (:issue:`50286`) - Changed behavior of :class:`Series` and :class:`DataFrame` constructors when given an integer dtype and
floating-point data that is not round numbers, this now raises ``ValueError`` instead of silently retaining the float dtype; do ``Series(data)`` or ``DataFrame(data)`` to get the old behavior, and ``Series(data).astype(dtype)`` or ``DataFrame(data).astype(dtype)`` to get the specified dtype (:issue:`49599`) - Changed behavior of :meth:`DataFrame.shift` with ``axis=1``, an integer ``fill_value``, and homogeneous datetime-like dtype, this now fills new columns with integer dtypes instead of casting to datetimelike (:issue:`49842`) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index d79f7068effc3..04b1266e4df17 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -777,7 +777,10 @@ cpdef ndarray[object] ensure_string_array( already_copied = True if not checknull(val): - if not util.is_float_object(val): + if isinstance(val, bytes): + # GH#49658 discussion of desired behavior here + result[i] = val.decode() + elif not util.is_float_object(val): # f"{val}" is faster than str(val) result[i] = f"{val}" else: diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py index 6e1795b150b27..89ea1670d9e7b 100644 --- a/pandas/tests/extension/base/casting.py +++ b/pandas/tests/extension/base/casting.py @@ -55,9 +55,12 @@ def test_astype_str(self, data): ], ) def test_astype_string(self, data, nullable_string_dtype): - # GH-33465 + # GH-33465, GH#45326 as of 2.0 we decode bytes instead of calling str(obj) result = pd.Series(data[:5]).astype(nullable_string_dtype) - expected = pd.Series([str(x) for x in data[:5]], dtype=nullable_string_dtype) + expected = pd.Series( + [str(x) if not isinstance(x, bytes) else x.decode() for x in data[:5]], + dtype=nullable_string_dtype, + ) self.assert_series_equal(result, expected) def test_to_numpy(self, data): diff --git a/pandas/tests/indexes/object/test_astype.py b/pandas/tests/indexes/object/test_astype.py index 5e9b94f42c22c..273b39b5e319d 100644 --- 
a/pandas/tests/indexes/object/test_astype.py +++ b/pandas/tests/indexes/object/test_astype.py @@ -15,7 +15,7 @@ def test_astype_str_from_bytes(): # ensure_string_array which does f"{val}" idx = Index(["あ", b"a"], dtype="object") result = idx.astype(str) - expected = Index(["あ", "b'a'"], dtype="object") + expected = Index(["あ", "a"], dtype="object") tm.assert_index_equal(result, expected) # while we're here, check that Series.astype behaves the same diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index ce17614e1f8b7..aae51ebc5a017 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -389,7 +389,13 @@ def test_astype_unicode(self): former_encoding = None if sys.getdefaultencoding() == "utf-8": - test_series.append(Series(["野菜食べないとやばい".encode()])) + # GH#45326 as of 2.0 Series.astype matches Index.astype by handling + # bytes with obj.decode() instead of str(obj) + item = "野菜食べないとやばい" + ser = Series([item.encode()]) + result = ser.astype("unicode") + expected = Series([item]) + tm.assert_series_equal(result, expected) for ser in test_series: res = ser.astype("unicode")