From 522443f5f4996af2a19f05f714a6a33450a2b9e7 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Fri, 18 Sep 2020 19:21:40 -0500 Subject: [PATCH 1/4] BUG: Fix astype from float32 to string --- doc/source/whatsnew/v1.1.3.rst | 2 ++ pandas/_libs/lib.pyx | 3 ++- pandas/core/arrays/string_.py | 3 +-- pandas/tests/arrays/string_/test_string.py | 8 ++++++++ pandas/tests/series/methods/test_astype.py | 9 +++++++++ 5 files changed, 22 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst index 1d386fa372ce1..90dfd596c38e4 100644 --- a/doc/source/whatsnew/v1.1.3.rst +++ b/doc/source/whatsnew/v1.1.3.rst @@ -35,6 +35,7 @@ Fixed regressions - Fixed regression in :meth:`Series.__getitem__` incorrectly raising when the input was a frozenset (:issue:`35747`) - Fixed regression in :meth:`read_excel` with ``engine="odf"`` caused ``UnboundLocalError`` in some cases where cells had nested child nodes (:issue:`36122`,:issue:`35802`) - Fixed regression in :class:`DataFrame` and :class:`Series` comparisons between numeric arrays and strings (:issue:`35700`,:issue:`36377`) +- Fixed regression in :meth:`Series.astype` showing too much precision when casting from ``np.float32`` to ``str`` (:issue:`36451`) .. --------------------------------------------------------------------------- @@ -47,6 +48,7 @@ Bug fixes - Bug in :class:`Series` constructor where integer overflow would occur for sufficiently large scalar inputs when an index was provided (:issue:`36291`) - Bug in :meth:`DataFrame.sort_values` raising an ``AttributeError`` when sorting on a key that casts column to categorical dtype (:issue:`36383`) - Bug in :meth:`DataFrame.stack` raising a ``ValueError`` when stacking :class:`MultiIndex` columns based on position when the levels had duplicate names (:issue:`36353`) +- Bug in :meth:`Series.astype` showing too much precision when casting from ``np.float32`` to string dtype (:issue:`36451`) .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index cc63df90a9a9f..0a198ceed517e 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -650,11 +650,12 @@ cpdef ndarray[object] ensure_string_array( Py_ssize_t i = 0, n = len(arr) result = np.asarray(arr, dtype="object") + if copy and result is arr: result = result.copy() for i in range(n): - val = result[i] + val = arr[i] if isinstance(val, str): continue diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index cef35f2b1137c..cb1144c18e49c 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -198,10 +198,9 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): if dtype: assert dtype == "string" - result = np.asarray(scalars, dtype="object") # convert non-na-likes to str, and nan-likes to StringDtype.na_value result = lib.ensure_string_array( - result, na_value=StringDtype.na_value, copy=copy + scalars, na_value=StringDtype.na_value, copy=copy ) # Manually creating new array avoids the validation step in the __init__, so is diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index efd5d29ae0717..69dc30e6a868a 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -336,3 +336,11 @@ def test_memory_usage(): series = pd.Series(["a", "b", "c"], dtype="string") assert 0 < series.nbytes <= series.memory_usage() < series.memory_usage(deep=True) + + +@pytest.mark.parametrize("dtype", [np.float16, np.float32, np.float64]) +def test_astype_from_float_dtype(dtype): + s = pd.Series([0.1], dtype=dtype) + result = s.astype("string") + expected = pd.Series(["0.1"], dtype="string") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index b9d90a9fc63dd..7fbe8d1d00f28 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -1,3 +1,4 @@ +import numpy as np import pytest from pandas import Interval, Series, Timestamp, date_range @@ -46,3 +47,11 @@ def test_astype_ignores_errors_for_extension_dtypes(self, values, errors): msg = "(Cannot cast)|(could not convert)" with pytest.raises((ValueError, TypeError), match=msg): values.astype(float, errors=errors) + + @pytest.mark.parametrize("dtype", [np.float16, np.float32, np.float64]) + def test_astype_from_float32_to_str(self, dtype): + # https://github.com/pandas-dev/pandas/issues/36451 + s = Series([0.1], dtype=dtype) + result = s.astype(str) + expected = Series(["0.1"]) + tm.assert_series_equal(result, expected) From f9f30c635937c63247979ae7c9a21f23fea54328 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Fri, 18 Sep 2020 19:42:56 -0500 Subject: [PATCH 2/4] Test name --- pandas/tests/series/methods/test_astype.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index 7fbe8d1d00f28..7449d8d65ef96 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -49,7 +49,7 @@ def test_astype_ignores_errors_for_extension_dtypes(self, values, errors): values.astype(float, errors=errors) @pytest.mark.parametrize("dtype", [np.float16, np.float32, np.float64]) - def test_astype_from_float32_to_str(self, dtype): + def test_astype_from_float_to_str(self, dtype): # https://github.com/pandas-dev/pandas/issues/36451 s = Series([0.1], dtype=dtype) result = s.astype(str) From f082eef7d6c1ae5a749441783b55ba52286673ee Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Fri, 18 Sep 2020 20:08:06 -0500 Subject: [PATCH 3/4] Issue link --- pandas/tests/arrays/string_/test_string.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 69dc30e6a868a..56a8e21edd004 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -340,6 +340,7 @@ def test_memory_usage(): @pytest.mark.parametrize("dtype", [np.float16, np.float32, np.float64]) def test_astype_from_float_dtype(dtype): + # https://github.com/pandas-dev/pandas/issues/36451 s = pd.Series([0.1], dtype=dtype) result = s.astype("string") expected = pd.Series(["0.1"], dtype="string") From 71facc0a703eea4ddb8afbab076c618870d45432 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sat, 19 Sep 2020 18:24:09 -0500 Subject: [PATCH 4/4] Remove note --- doc/source/whatsnew/v1.1.3.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst index 6a850c1e7c7b2..72937141c2870 100644 --- a/doc/source/whatsnew/v1.1.3.rst +++ b/doc/source/whatsnew/v1.1.3.rst @@ -35,7 +35,6 @@ Fixed regressions - Fixed regression in :meth:`Series.__getitem__` incorrectly raising when the input was a frozenset (:issue:`35747`) - Fixed regression in :meth:`read_excel` with ``engine="odf"`` caused ``UnboundLocalError`` in some cases where cells had nested child nodes (:issue:`36122`,:issue:`35802`) - Fixed regression in :class:`DataFrame` and :class:`Series` comparisons between numeric arrays and strings (:issue:`35700`,:issue:`36377`) -- Fixed regression in :meth:`Series.astype` showing too much precision when casting from ``np.float32`` to ``str`` (:issue:`36451`) .. ---------------------------------------------------------------------------