diff --git a/doc/source/getting_started/basics.rst b/doc/source/getting_started/basics.rst index f47fa48eb6202..9983de4e4d135 100644 --- a/doc/source/getting_started/basics.rst +++ b/doc/source/getting_started/basics.rst @@ -974,12 +974,14 @@ On a ``Series``, multiple functions return a ``Series``, indexed by the function Passing a ``lambda`` function will yield a ``<lambda>`` named row: .. ipython:: python + :okwarning: tsdf['A'].agg(['sum', lambda x: x.mean()]) Passing a named function will yield that name for the row: .. ipython:: python + :okwarning: def mymean(x): return x.mean() @@ -1034,6 +1036,7 @@ With ``.agg()`` is it possible to easily create a custom describe function, simi to the built in :ref:`describe function <basics.describe>`. .. ipython:: python + :okwarning: from functools import partial @@ -1066,7 +1069,6 @@ Transform the entire frame. ``.transform()`` allows input functions as: a NumPy function name or a user defined function. .. ipython:: python - :okwarning: tsdf.transform(np.abs) tsdf.transform('abs') @@ -1093,6 +1095,7 @@ The first level will be the original frame column names; the second level will be the names of the transforming functions. .. ipython:: python + :okwarning: tsdf.transform([np.abs, lambda x: x + 1]) @@ -1100,6 +1103,7 @@ Passing multiple functions to a Series will yield a DataFrame. The resulting column names will be the transforming functions. .. ipython:: python + :okwarning: tsdf['A'].transform([np.abs, lambda x: x + 1]) @@ -1111,6 +1115,7 @@ Transforming with a dict Passing a dict of functions will allow selective transforming per column. .. ipython:: python + :okwarning: tsdf.transform({'A': np.abs, 'B': lambda x: x + 1}) @@ -1138,6 +1143,7 @@ a single value and returning a single value. For example: df4 = df_orig.copy() .. 
ipython:: python + :okwarning: df4 diff --git a/doc/source/user_guide/integer_na.rst b/doc/source/user_guide/integer_na.rst index a45d7a4fa1547..28969358ecfc0 100644 --- a/doc/source/user_guide/integer_na.rst +++ b/doc/source/user_guide/integer_na.rst @@ -99,6 +99,7 @@ Missing values will be propagated, and the data will be coerced to another dtype if needed. .. ipython:: python + :okwarning: s = pd.Series([1, 2, None], dtype="Int64") @@ -129,6 +130,7 @@ These dtypes can operate as part of of ``DataFrame``. These dtypes can be merged & reshaped & casted. .. ipython:: python + :okwarning: pd.concat([df[['A']], df[['B', 'C']]], axis=1).dtypes df['A'].astype(float) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 82e01b62efbb9..f22dc7b86bcf7 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -392,6 +392,7 @@ Or you can use the :func:`~pandas.to_numeric` function to coerce the dtypes after reading in the data, .. ipython:: python + :okwarning: df2 = pd.read_csv(StringIO(data)) df2['col_1'] = pd.to_numeric(df2['col_1'], errors='coerce') diff --git a/doc/source/user_guide/sparse.rst b/doc/source/user_guide/sparse.rst index c258a8840b714..adcaff988887a 100644 --- a/doc/source/user_guide/sparse.rst +++ b/doc/source/user_guide/sparse.rst @@ -26,6 +26,7 @@ The sparse objects exist for memory efficiency reasons. Suppose you had a large, mostly NA ``DataFrame``: .. ipython:: python + :okwarning: df = pd.DataFrame(np.random.randn(10000, 4)) df.iloc[:9998] = np.nan @@ -300,6 +301,7 @@ meth:`Series.sparse.to_coo` is implemented for transforming a ``Series`` with sp The method requires a ``MultiIndex`` with two or more levels. .. 
ipython:: python + :okwarning: s = pd.Series([3.0, np.nan, 1.0, 3.0, np.nan, np.nan]) s.index = pd.MultiIndex.from_tuples([(1, 2, 'a', 0), diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index 7a8400d124b22..3a8eace85b9e2 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -86,6 +86,7 @@ l. For ``StringDtype``, :ref:`string accessor methods` Both outputs are ``Int64`` dtype. Compare that with object-dtype .. ipython:: python + :okwarning: s.astype(object).str.count("a") s.astype(object).dropna().str.count("a") diff --git a/doc/source/user_guide/timedeltas.rst b/doc/source/user_guide/timedeltas.rst index 3439a0a4c13c7..2c60134aba1c3 100644 --- a/doc/source/user_guide/timedeltas.rst +++ b/doc/source/user_guide/timedeltas.rst @@ -237,6 +237,7 @@ or by astyping to a specific timedelta type. These operations yield Series and p Note that division by the NumPy scalar is true division, while astyping is equivalent of floor division. .. ipython:: python + :okwarning: december = pd.Series(pd.date_range('20121201', periods=4)) january = pd.Series(pd.date_range('20130101', periods=4)) diff --git a/doc/source/whatsnew/v0.11.0.rst b/doc/source/whatsnew/v0.11.0.rst index 148ee349b049c..ad3aa735f2b22 100644 --- a/doc/source/whatsnew/v0.11.0.rst +++ b/doc/source/whatsnew/v0.11.0.rst @@ -305,6 +305,7 @@ Furthermore ``datetime64[ns]`` columns are created by default, when passed datet Astype conversion on ``datetime64[ns]`` to ``object``, implicitly converts ``NaT`` to ``np.nan`` .. ipython:: python + :okwarning: s = pd.Series([datetime.datetime(2001, 1, 2, 0, 0) for i in range(3)]) s.dtype diff --git a/doc/source/whatsnew/v0.13.0.rst b/doc/source/whatsnew/v0.13.0.rst index 43c6083fdce8f..6200e5c65ac0a 100644 --- a/doc/source/whatsnew/v0.13.0.rst +++ b/doc/source/whatsnew/v0.13.0.rst @@ -532,6 +532,7 @@ Enhancements is frequency conversion. See :ref:`the docs` for the docs. .. 
ipython:: python + :okwarning: import datetime td = pd.Series(pd.date_range('20130101', periods=4)) - pd.Series( diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 85de0150a5a28..c37bbf81254a4 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -57,6 +57,7 @@ marker of ``np.nan`` will infer to integer dtype. The display of the ``Series`` Operations on these dtypes will propagate ``NaN`` as other pandas operations. .. ipython:: python + :okwarning: # arithmetic s + 1 @@ -85,6 +86,7 @@ These dtypes can operate as part of a ``DataFrame``. These dtypes can be merged, reshaped, and casted. .. ipython:: python + :okwarning: pd.concat([df[['A']], df[['B', 'C']]], axis=1).dtypes df['A'].astype(float) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index b9cc1dad53674..bb9f287eb5f9b 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -560,6 +560,30 @@ Documentation Improvements Deprecations ~~~~~~~~~~~~ +String conversion of Series with nan +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:meth:`Series.astype` with ``str`` previously coerced ``np.nan`` to the string ``'nan'``. Now pandas preserves the missing value indicator (:issue:`25353`). + +*Previous behavior*: + +.. code-block:: ipython + + In [1]: pd.Series(['foo', np.nan]).astype(str) + Out[1]: + 0 foo + 1 nan + dtype: object + +*New behavior*: + +.. ipython:: python + + pd.Series(['foo', np.nan]).astype(str) + +Other deprecations +^^^^^^^^^^^^^^^^^^ + - :meth:`Series.item` and :meth:`Index.item` have been _undeprecated_ (:issue:`29250`) - ``Index.set_value`` has been deprecated. 
For a given index ``idx``, array ``arr``, value in ``idx`` of ``idx_val`` and a new value of ``val``, ``idx.set_value(arr, idx_val, val)`` diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 0579c97747bae..88ae337ef784e 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -789,7 +789,7 @@ def conv(r, dtype): return [conv(r, dtype) for r, dtype in zip(result, dtypes)] -def astype_nansafe(arr, dtype, copy: bool = True, skipna: bool = False): +def astype_nansafe(arr, dtype, copy: bool = True, skipna: bool = True): """ Cast the elements of an array to a given dtype a nan-safe manner. diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f5b0ce1ae77fb..cacd859d1ad53 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5526,6 +5526,8 @@ def astype( - ``raise`` : allow exceptions to be raised - ``ignore`` : suppress exceptions. On error return original object. + .. versionadded:: 0.20.0 + Returns ------- casted : same type as caller @@ -5603,6 +5605,13 @@ def astype( 1 2 dtype: int64 """ + if isna(self.values).any(): + msg = ( + "The meaning of the missing value indicator is preserved " + "by default in the future version." + ) + warnings.warn(msg, FutureWarning, stacklevel=2) + if is_dict_like(dtype): if self.ndim == 1: # i.e. 
Series if len(dtype) > 1 or self.name not in dtype: @@ -5623,7 +5632,7 @@ def astype( for col_name, col in self.items(): if col_name in dtype: results.append( - col.astype(dtype=dtype[col_name], copy=copy, errors=errors) + col.astype(dtype=dtype[col_name], copy=copy, errors=errors,) ) else: results.append(col.copy() if copy else col) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 5d240a3d7821f..74aa7d4b850d7 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -532,8 +532,10 @@ def f(mask, val, idx): return self.split_and_operate(None, f, False) def astype(self, dtype, copy: bool = False, errors: str = "raise"): - """ - Coerce to the new dtype. + return self._astype(dtype, copy=copy, errors=errors) + + def _astype(self, dtype, copy=False, errors="raise"): + """Coerce to the new type Parameters ---------- diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 06bb040224455..722fe909a5aaf 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -580,13 +580,13 @@ def test_astype_str(self): tm.assert_frame_equal(result, expected) def test_astype_str_float(self): - # see gh-11302 + # GH 25353 result = DataFrame([np.NaN]).astype(str) - expected = DataFrame(["nan"]) - + expected = DataFrame([np.nan], dtype=object) tm.assert_frame_equal(result, expected) - result = DataFrame([1.12345678901234567890]).astype(str) + # see gh-11302 + result = DataFrame([1.12345678901234567890]).astype(str) # < 1.14 truncates # >= 1.14 preserves the full repr val = "1.12345678901" if _np_version_under1p14 else "1.1234567890123457" diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 7400b049961d5..c1a0d1b2eb6d4 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -1102,7 +1102,10 @@ def test_mode_numerical_nan(self, dropna, expected): 
@pytest.mark.parametrize( "dropna, expected1, expected2, expected3", - [(True, ["b"], ["bar"], ["nan"]), (False, ["b"], [np.nan], ["nan"])], + [ + (True, ["b"], ["bar"], Series(["bar"])), + (False, ["b"], [np.nan], Series([np.nan], dtype=object)), + ], ) def test_mode_str_obj(self, dropna, expected1, expected2, expected3): # Test string and object types. @@ -1124,7 +1127,6 @@ def test_mode_str_obj(self, dropna, expected1, expected2, expected3): s = Series(data, dtype=object).astype(str) result = s.mode(dropna) - expected3 = Series(expected3, dtype=str) tm.assert_series_equal(result, expected3) @pytest.mark.parametrize( diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 4bf2f1bd82eff..555e11fd636e3 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -119,15 +119,11 @@ def test_astype_datetime64tz(self): tm.assert_series_equal(result, expected) @pytest.mark.parametrize("dtype", [str, np.str_]) - @pytest.mark.parametrize( - "series", - [ - Series([string.digits * 10, tm.rands(63), tm.rands(64), tm.rands(1000)]), - Series([string.digits * 10, tm.rands(63), tm.rands(64), np.nan, 1.0]), - ], - ) - def test_astype_str_map(self, dtype, series): + def test_astype_str_map(self, dtype): # see gh-4405 + series = Series( + [string.digits * 10, tm.rands(63), tm.rands(64), tm.rands(1000)] + ) result = series.astype(dtype) expected = series.map(str) tm.assert_series_equal(result, expected) @@ -152,6 +148,19 @@ def test_astype_str_cast(self): expected = Series([str("1 days 00:00:00.000000000")]) tm.assert_series_equal(s, expected) + def test_astype_str(self): + # GH 25353 + ser = Series([1, "a", np.nan]) + result = ser.astype(str) + expected = Series(["1", "a", np.nan]) + tm.assert_series_equal(result, expected) + + def test_deprecate_astype_str(self): + # GH 25353 + ser = Series([1, "a", np.nan]) + with tm.assert_produces_warning(expected_warning=FutureWarning): + ser.astype(str) + def 
test_astype_unicode(self): # see gh-7758: A bit of magic is required to set # default encoding to utf-8