From 3b6b8c2b1e44fa8597751216b10fbe693a96d346 Mon Sep 17 00:00:00 2001 From: Tomasz Bochenek Date: Tue, 30 Jun 2020 10:56:48 +0200 Subject: [PATCH 1/3] Add skipna option to astype method --- pandas/core/generic.py | 39 +++++++++++++++++++++++++++--- pandas/core/internals/blocks.py | 16 +++++++++--- pandas/core/internals/managers.py | 6 +++-- pandas/tests/series/test_dtypes.py | 11 +++++++++ 4 files changed, 62 insertions(+), 10 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index a66cade3b81b0..5536567de0c06 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5383,7 +5383,11 @@ def _to_dict_of_blocks(self, copy: bool_t = True): } def astype( - self: FrameOrSeries, dtype, copy: bool_t = True, errors: str = "raise" + self: FrameOrSeries, + dtype, + copy: bool_t = True, + errors: str = "raise", + skipna: bool_t = False, ) -> FrameOrSeries: """ Cast a pandas object to a specified dtype ``dtype``. @@ -5404,6 +5408,10 @@ def astype( - ``raise`` : allow exceptions to be raised - ``ignore`` : suppress exceptions. On error return original object. + skipna : bool, default False + When ``skipna=False`` (default) nan values will be cast to proper + dtype. + Preserve nan values when ``skipna=True``. Returns ------- @@ -5499,6 +5507,22 @@ def astype( 1 2020-01-01 19:00:00-05:00 2 2020-01-02 19:00:00-05:00 dtype: datetime64[ns, US/Eastern] + + + By default NaN values will be casted to dtype: + >>> pd.Series([None, 1]).astype(str) + 0 nan + 1 1.0 + dtype: object + + + Skip casting NaN values: + + >>> pd.Series([None, 1]).astype(str, skipna=True) + 0 NaN + 1 42.0 + dtype: object + """ if is_dict_like(dtype): if self.ndim == 1: # i.e. 
Series @@ -5520,7 +5544,12 @@ def astype( for col_name, col in self.items(): if col_name in dtype: results.append( - col.astype(dtype=dtype[col_name], copy=copy, errors=errors) + col.astype( + dtype=dtype[col_name], + copy=copy, + errors=errors, + skipna=skipna, + ) ) else: results.append(col.copy() if copy else col) @@ -5529,13 +5558,15 @@ def astype( # GH 18099/22869: columnwise conversion to extension dtype # GH 24704: use iloc to handle duplicate column names results = [ - self.iloc[:, i].astype(dtype, copy=copy) + self.iloc[:, i].astype(dtype, copy=copy, skipna=skipna) for i in range(len(self.columns)) ] else: # else, only a single dtype is given - new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors,) + new_data = self._mgr.astype( + dtype=dtype, copy=copy, errors=errors, skipna=skipna + ) return self._constructor(new_data).__finalize__(self, method="astype") # GH 33113: handle empty frame or series diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 6207785fb2975..3d2013444f2c0 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -516,7 +516,9 @@ def f(mask, val, idx): return self.split_and_operate(None, f, False) - def astype(self, dtype, copy: bool = False, errors: str = "raise"): + def astype( + self, dtype, copy: bool = False, errors: str = "raise", skipna: bool = False + ): """ Coerce to the new dtype. @@ -528,6 +530,10 @@ def astype(self, dtype, copy: bool = False, errors: str = "raise"): errors : str, {'raise', 'ignore'}, default 'ignore' - ``raise`` : allow exceptions to be raised - ``ignore`` : suppress exceptions. On error return original object + skipna : bool, default False + When ``skipna=False`` (default) nan values will be casted to proper + dtype. + Skip casting nan values when ``skipna=True``. 
Returns ------- @@ -592,7 +598,7 @@ def astype(self, dtype, copy: bool = False, errors: str = "raise"): # _astype_nansafe works fine with 1-d only vals1d = values.ravel() try: - values = astype_nansafe(vals1d, dtype, copy=True) + values = astype_nansafe(vals1d, dtype, copy=True, skipna=skipna) except (ValueError, TypeError): # e.g. astype_nansafe can fail on object-dtype of strings # trying to convert to float @@ -2094,7 +2100,9 @@ def _maybe_coerce_values(self, values): assert isinstance(values, np.ndarray), type(values) return values - def astype(self, dtype, copy: bool = False, errors: str = "raise"): + def astype( + self, dtype, copy: bool = False, errors: str = "raise", skipna: bool = False + ): """ these automatically copy, so copy=True has no effect raise on an except if raise == True @@ -2113,7 +2121,7 @@ def astype(self, dtype, copy: bool = False, errors: str = "raise"): return self.make_block(values) # delegate - return super().astype(dtype=dtype, copy=copy, errors=errors) + return super().astype(dtype=dtype, copy=copy, errors=errors, skipna=skipna) def _can_hold_element(self, element: Any) -> bool: tipo = maybe_infer_dtype_type(element) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index b2f2277d9a7dc..79244600e4e0a 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -560,9 +560,11 @@ def downcast(self) -> "BlockManager": return self.apply("downcast") def astype( - self, dtype, copy: bool = False, errors: str = "raise" + self, dtype, copy: bool = False, errors: str = "raise", skipna: bool = False ) -> "BlockManager": - return self.apply("astype", dtype=dtype, copy=copy, errors=errors) + return self.apply( + "astype", dtype=dtype, copy=copy, errors=errors, skipna=skipna + ) def convert( self, diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index bcc0b18134dad..ad35402db709a 100644 --- a/pandas/tests/series/test_dtypes.py +++ 
b/pandas/tests/series/test_dtypes.py @@ -495,3 +495,14 @@ def test_reindex_astype_order_consistency(self): s1 = s.reindex(new_index).astype(temp_dtype).astype(new_dtype) s2 = s.astype(temp_dtype).reindex(new_index).astype(new_dtype) tm.assert_series_equal(s1, s2) + + def test_astype_skipna_default(self): + arr = Series([1.0, np.nan, 3.0, 4.0]) + result = arr.astype(str) + tm.assert_series_equal(result, Series(["1.0", "nan", "3.0", "4.0"])) + + def test_astype_skipna_true(self): + # GH 31708 + arr = Series([1.0, np.nan, 3.0, 4.0]) + result = arr.astype(str, skipna=True) + tm.assert_series_equal(result, Series(["1.0", np.nan, "3.0", "4.0"])) From 5208d01121ea5b1020d7f0375facac819e14b600 Mon Sep 17 00:00:00 2001 From: Tomasz Bochenek Date: Tue, 30 Jun 2020 11:42:48 +0200 Subject: [PATCH 2/3] Docstring fixes --- pandas/core/generic.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 5536567de0c06..47e9a39ceb171 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5511,8 +5511,8 @@ def astype( By default NaN values will be casted to dtype: >>> pd.Series([None, 1]).astype(str) - 0 nan - 1 1.0 + 0 nan + 1 1.0 dtype: object @@ -5520,9 +5520,8 @@ def astype( >>> pd.Series([None, 1]).astype(str, skipna=True) 0 NaN - 1 42.0 + 1 42.0 dtype: object - """ if is_dict_like(dtype): if self.ndim == 1: # i.e. 
Series From 5b37d5267345fa63afdf49856ceffdd555a72c50 Mon Sep 17 00:00:00 2001 From: Tomasz Bochenek Date: Tue, 30 Jun 2020 12:12:08 +0200 Subject: [PATCH 3/3] Docstring fixes --- pandas/core/generic.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 47e9a39ceb171..42d9ddc0605bf 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5508,19 +5508,17 @@ def astype( 2 2020-01-02 19:00:00-05:00 dtype: datetime64[ns, US/Eastern] - By default NaN values will be casted to dtype: >>> pd.Series([None, 1]).astype(str) 0 nan 1 1.0 dtype: object - Skip casting NaN values: >>> pd.Series([None, 1]).astype(str, skipna=True) - 0 NaN - 1 42.0 + 0 NaN + 1 1.0 dtype: object """ if is_dict_like(dtype):