From 4bc0e800790b9844a8525d8fd78a4f652daaaa4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADtor=20Araujo?= Date: Sun, 20 Nov 2022 19:53:15 +0000 Subject: [PATCH 1/6] TST: Tests for replace method when column contains pd.NA (#47480) Tests fail for pd.Series and pd.DataFrame when dtype is not declared. --- pandas/tests/frame/methods/test_replace.py | 8 ++++++++ pandas/tests/series/methods/test_replace.py | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 626bc658b199c..abaa4cb1c4cac 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -1503,6 +1503,14 @@ def test_replace_value_none_dtype_numeric(self, val): result = df.replace({val: None}) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("dtype", [None, "Int64", "Float64"]) + def test_replace_df_containing_na(self, dtype): + # GH#47480 + df = DataFrame({"A": [4, 1, pd.NA], "B": [1, 3, 2]}, dtype=dtype) + df.replace(to_replace=1, value=100, inplace=True) + expected = DataFrame({"A": [4, 100, pd.NA], "B": [100, 3, 2]}, dtype=dtype) + tm.assert_frame_equal(df, expected) + class TestDataFrameReplaceRegex: @pytest.mark.parametrize( diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 59afe22e40f7a..b329dc6dd1a87 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -674,3 +674,11 @@ def test_replace_value_none_dtype_numeric(self, val): result = ser.replace(val, None) expected = pd.Series([1, None], dtype=object) tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("dtype", [None, "Int64", "Float64"]) + def test_replace_series_containing_na(self, dtype): + # GH#47480 + df = pd.Series([4, 1, pd.NA, 1], dtype=dtype) + df.replace(to_replace=1, value=100, inplace=True) + expected = pd.Series([4, 100, pd.NA, 100], dtype=dtype) + tm.assert_series_equal(df, expected) From c30bd01d031c9818116c0b8cc796e92c2a33932a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADtor=20Araujo?= Date: Mon, 21 Nov 2022 21:29:11 +0000 Subject: [PATCH 2/6] BUG: Replace method raising error in col with pd.NA (#47480) --- pandas/core/missing.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 0d058ead9d22c..33519fed47c6e 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -94,6 +94,11 @@ def mask_missing(arr: ArrayLike, values_to_mask) -> npt.NDArray[np.bool_]: pass else: new_mask = arr == x + # GH#47480 + if isinstance(new_mask, bool): + new_mask = np.equal(arr, x, dtype=object) + mask_na = np.vectorize(lambda value: False if isna(value) else value) + new_mask = mask_na(new_mask).astype(bool) if not isinstance(new_mask, np.ndarray): # usually BooleanArray new_mask = new_mask.to_numpy(dtype=bool, na_value=False) From febffcba66bb3175c186362feb62bf8a6cd99111 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADtor=20Araujo?= Date: Mon, 21 Nov 2022 23:08:55 +0000 Subject: [PATCH 3/6] BUG: Replace method raising error in col with pd.NA (#47480) --- pandas/core/missing.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 33519fed47c6e..57516577c048a 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -93,12 +93,11 @@ def mask_missing(arr: ArrayLike, values_to_mask) -> npt.NDArray[np.bool_]: # GH#29553 prevent numpy deprecation warnings pass else: - new_mask = arr == x # GH#47480 - if isinstance(new_mask, bool): - new_mask = np.equal(arr, x, dtype=object) - mask_na = np.vectorize(lambda value: False if isna(value) else value) - new_mask = mask_na(new_mask).astype(bool) + mask_func = np.vectorize( + lambda value: False if (isna(value) or value != x) else True + ) + new_mask = mask_func(arr).astype(bool) if not isinstance(new_mask, np.ndarray): # usually BooleanArray new_mask = new_mask.to_numpy(dtype=bool, na_value=False) From 6f90ac3b2ae71430b229b7b6da3c5f49cf249a5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADtor=20Araujo?= Date: Tue, 22 Nov 2022 00:40:55 +0000 Subject: [PATCH 4/6] BUG: Replace method raising error in col with pd.NA (#47480) --- pandas/core/missing.py | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 57516577c048a..fc5fc54e4cdaf 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -31,7 +31,6 @@ from pandas.core.dtypes.cast import infer_dtype_from from pandas.core.dtypes.common import ( is_array_like, - is_numeric_v_string_like, needs_i8_conversion, ) from pandas.core.dtypes.missing import ( @@ -89,19 +88,11 @@ def mask_missing(arr: ArrayLike, values_to_mask) -> npt.NDArray[np.bool_]: # GH 21977 mask = np.zeros(arr.shape, dtype=bool) for x in nonna: - if is_numeric_v_string_like(arr, x): - # GH#29553 prevent numpy deprecation warnings - pass - else: - # GH#47480 - mask_func = np.vectorize( - lambda value: False if (isna(value) or value != x) else True - ) - new_mask = mask_func(arr).astype(bool) - if not isinstance(new_mask, np.ndarray): - # usually BooleanArray - new_mask = new_mask.to_numpy(dtype=bool, na_value=False) - mask |= new_mask + # GH#47480 + new_mask = np.vectorize( + lambda value: False if (isna(value) or value != x) else True + )(arr) + mask |= new_mask if na_mask.any(): mask |= isna(arr) From 44331b457854eecf489eb817357c6b8ec7164482 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADtor=20Araujo?= Date: Wed, 23 Nov 2022 23:57:16 +0000 Subject: [PATCH 5/6] BUG: Replace method raising error in col with pd.NA (#47480) --- pandas/core/missing.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index fc5fc54e4cdaf..40d61a560bee2 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -31,6 +31,7 @@ from pandas.core.dtypes.cast import infer_dtype_from from pandas.core.dtypes.common import ( is_array_like, + is_numeric_v_string_like, needs_i8_conversion, ) from pandas.core.dtypes.missing import ( @@ -87,12 +88,19 @@ def mask_missing(arr: ArrayLike, values_to_mask) -> npt.NDArray[np.bool_]: # GH 21977 mask = np.zeros(arr.shape, dtype=bool) + arr_na_mask = ~isna(arr) for x in nonna: - # GH#47480 - new_mask = np.vectorize( - lambda value: False if (isna(value) or value != x) else True - )(arr) - mask |= new_mask + if is_numeric_v_string_like(arr, x): + # GH#29553 prevent numpy deprecation warnings + pass + else: + # GH#47480 + new_mask = np.zeros_like(arr, dtype=bool) + new_mask[arr_na_mask] = arr[arr_na_mask] == x + if not isinstance(new_mask, np.ndarray): + # usually BooleanArray + new_mask = new_mask.to_numpy(dtype=bool, na_value=False) + mask |= new_mask if na_mask.any(): mask |= isna(arr) From 6b2dd72137f633e2fa65acd4cafd330b530a26c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADtor=20Araujo?= Date: Tue, 13 Dec 2022 22:07:59 +0000 Subject: [PATCH 6/6] BUG: Check if array is of numpy type with NAs on mask_missing func (#47480) --- pandas/core/missing.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 40d61a560bee2..2ec1731d050da 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -88,15 +88,20 @@ def mask_missing(arr: ArrayLike, values_to_mask) -> npt.NDArray[np.bool_]: # GH 21977 mask = np.zeros(arr.shape, dtype=bool) - arr_na_mask = ~isna(arr) + arr_nas = isna(arr) for x in nonna: if is_numeric_v_string_like(arr, x): # GH#29553 prevent numpy deprecation warnings pass else: - # GH#47480 - new_mask = np.zeros_like(arr, dtype=bool) - new_mask[arr_na_mask] = arr[arr_na_mask] == x + # GH#47480 'arr == x' returns scalar when arr is numpy array + # containing NAs (Usually when Series has dtype object with NAs) + if isinstance(arr, np.ndarray) and arr_nas.any(): + new_mask = np.zeros_like(arr, dtype=bool) + new_mask[~arr_nas] = arr[~arr_nas] == x + else: + new_mask = arr == x + if not isinstance(new_mask, np.ndarray): # usually BooleanArray new_mask = new_mask.to_numpy(dtype=bool, na_value=False)