From 47f66760879469dec6c0b8a5aa2a8d389e5ed99f Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Sat, 21 Mar 2020 18:14:05 +0100 Subject: [PATCH 01/13] BUG: Fix replacing in `string` series with NA (pandas-dev#32621) The pd.NA values are replaced with np.nan before comparing the arrays/scalars --- pandas/core/internals/managers.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 66e96af05eb71..e57d546a91536 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1936,6 +1936,12 @@ def _compare_or_regex_search(a, b, regex=False): is_a_array = isinstance(a, np.ndarray) is_b_array = isinstance(b, np.ndarray) + # Replace all definitions of missing values (isna=True) to a numpy.nan + if is_a_array: + a = np.where(isna(a), np.nan, a) + if is_b_array: + b = np.where(isna(b), np.nan, b) + if is_datetimelike_v_numeric(a, b) or is_numeric_v_string_like(a, b): # GH#29553 avoid deprecation warnings from numpy result = False From 2b5320047fd8d69f0957cd784915b7525abe86d8 Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Sat, 21 Mar 2020 19:33:38 +0100 Subject: [PATCH 02/13] BUG: Fix replacing in `string` series with NA (pandas-dev#32621) Made improvements based on the tests which failed --- pandas/core/internals/managers.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index e57d546a91536..d82f6e973e143 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1936,11 +1936,18 @@ def _compare_or_regex_search(a, b, regex=False): is_a_array = isinstance(a, np.ndarray) is_b_array = isinstance(b, np.ndarray) + def _get_nan_value(x): + if np.issubdtype(x.dtype, np.datetime64): + return np.datetime64('NaT') + elif np.issubdtype(x.dtype, np.timedelta64): + return np.timedelta64('NaT') + return np.nan + # Replace all definitions of missing 
values (isna=True) to a numpy.nan if is_a_array: - a = np.where(isna(a), np.nan, a) + a = np.where(isna(a), _get_nan_value(a), a) if is_b_array: - b = np.where(isna(b), np.nan, b) + b = np.where(isna(b), _get_nan_value(b), b) if is_datetimelike_v_numeric(a, b) or is_numeric_v_string_like(a, b): # GH#29553 avoid deprecation warnings from numpy From 7678495dee0027517468ace84d4ad52575133e09 Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Sat, 21 Mar 2020 19:51:59 +0100 Subject: [PATCH 03/13] BUG: Fix replacing in `string` series with NA (pandas-dev#32621) Added change to resolve linting check --- pandas/core/internals/managers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index d82f6e973e143..ff0b4b66f8e90 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1938,9 +1938,9 @@ def _compare_or_regex_search(a, b, regex=False): def _get_nan_value(x): if np.issubdtype(x.dtype, np.datetime64): - return np.datetime64('NaT') + return np.datetime64("NaT") elif np.issubdtype(x.dtype, np.timedelta64): - return np.timedelta64('NaT') + return np.timedelta64("NaT") return np.nan # Replace all definitions of missing values (isna=True) to a numpy.nan From 719369d1041b16d3a9336dfdb1988194c4fdd697 Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Sat, 21 Mar 2020 20:54:20 +0100 Subject: [PATCH 04/13] BUG: Fix replacing in `string` series with NA (pandas-dev#32621) Added test for the reported bug --- pandas/tests/series/methods/test_replace.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 904a455870ab1..0cc4d0d146cc1 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -241,6 +241,13 @@ def test_replace2(self): assert (ser[6:10] == -1).all() assert (ser[20:30] == -1).all() + def 
test_replace_with_dictlike_and_string_dtype(self): + # GH 32621 + s = pd.Series(["one", "two", np.nan], dtype="string") + expected = pd.Series(["1", "2", np.nan]) + result = s.replace({"one": "1", "two": "2"}) + tm.assert_series_equal(expected, result) + def test_replace_with_empty_dictlike(self): # GH 15289 s = pd.Series(list("abcd")) From e98c7c9dec34b1a445d385cf0694670cd7ac0211 Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Sun, 22 Mar 2020 23:53:19 +0100 Subject: [PATCH 05/13] BUG: Fix replacing in `string` series with NA (pandas-dev#32621) --- pandas/core/internals/managers.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index ff0b4b66f8e90..d8863c446c7a8 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -28,7 +28,7 @@ from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ABCExtensionArray, ABCSeries -from pandas.core.dtypes.missing import isna +from pandas.core.dtypes.missing import isna, na_value_for_dtype import pandas.core.algorithms as algos from pandas.core.arrays.sparse import SparseDtype @@ -1937,23 +1937,22 @@ def _compare_or_regex_search(a, b, regex=False): is_b_array = isinstance(b, np.ndarray) def _get_nan_value(x): - if np.issubdtype(x.dtype, np.datetime64): - return np.datetime64("NaT") - elif np.issubdtype(x.dtype, np.timedelta64): - return np.timedelta64("NaT") - return np.nan - - # Replace all definitions of missing values (isna=True) to a numpy.nan + # GH#32621 replace all pd.NAs to avoid failure of element-wise comparison + mask = isna(a) | isna(b) if is_a_array: - a = np.where(isna(a), _get_nan_value(a), a) + a = np.where(mask, na_value_for_dtype(a.dtype, compat=False), a) if is_b_array: - b = np.where(isna(b), _get_nan_value(b), b) + b = np.where(mask, na_value_for_dtype(b.dtype, compat=False), b) if 
is_datetimelike_v_numeric(a, b) or is_numeric_v_string_like(a, b): # GH#29553 avoid deprecation warnings from numpy result = False else: result = op(a) + if isinstance(result, np.ndarray): + result[mask] = na_value_for_dtype(result.dtype, compat=False) + elif isna(result): + result = na_value_for_dtype(np.bool, compat=False) if is_scalar(result) and (is_a_array or is_b_array): type_names = [type(a).__name__, type(b).__name__] From fb8d14396be5c50fc34d762dd0ed674e53b57bb0 Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Sun, 22 Mar 2020 23:57:21 +0100 Subject: [PATCH 06/13] BUG: Fix replacing in `string` series with NA (pandas-dev#32621) --- pandas/core/internals/managers.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index d8863c446c7a8..cb8e21002a10e 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1935,8 +1935,7 @@ def _compare_or_regex_search(a, b, regex=False): is_a_array = isinstance(a, np.ndarray) is_b_array = isinstance(b, np.ndarray) - - def _get_nan_value(x): + # GH#32621 replace all pd.NAs to avoid failure of element-wise comparison mask = isna(a) | isna(b) if is_a_array: From ca81cb0329c97de7593ad80e7eb8141f5ff72212 Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Sun, 29 Mar 2020 13:27:56 +0200 Subject: [PATCH 07/13] BUG: Fix replacing in `string` series with NA (pandas-dev#32621) --- pandas/core/internals/managers.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 51cacbd7150cc..736aec1ae5ca4 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -28,7 +28,7 @@ from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries -from pandas.core.dtypes.missing import 
isna, na_value_for_dtype +from pandas.core.dtypes.missing import isna import pandas.core.algorithms as algos from pandas.core.arrays.sparse import SparseDtype @@ -1948,22 +1948,28 @@ def _compare_or_regex_search(a, b, regex=False): is_a_array = isinstance(a, np.ndarray) is_b_array = isinstance(b, np.ndarray) - # GH#32621 replace all pd.NAs to avoid failure of element-wise comparison - mask = isna(a) | isna(b) - if is_a_array: - a = np.where(mask, na_value_for_dtype(a.dtype, compat=False), a) - if is_b_array: - b = np.where(mask, na_value_for_dtype(b.dtype, compat=False), b) - if is_datetimelike_v_numeric(a, b) or is_numeric_v_string_like(a, b): # GH#29553 avoid deprecation warnings from numpy result = False else: + # GH#32621 use mask to avoid comparing to NAs + if is_a_array and not is_b_array: + mask = np.reshape(~(isna(a)), a.shape) + elif is_b_array and not is_a_array: + mask = np.reshape(~(isna(b)), b.shape) + elif is_a_array and is_b_array: + mask = ~(isna(a) | isna(b)) + + if is_a_array: + a = a[mask] + if is_b_array: + b = b[mask] result = op(a) + if isinstance(result, np.ndarray): - result[mask] = na_value_for_dtype(result.dtype, compat=False) - elif isna(result): - result = na_value_for_dtype(np.bool, compat=False) + tmp = np.zeros(mask.shape, dtype=np.bool) + tmp[mask] = result + result = tmp if is_scalar(result) and (is_a_array or is_b_array): type_names = [type(a).__name__, type(b).__name__] From c32a2cca47fa7e1f3269755f10f376fbfe45bcc1 Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Wed, 8 Apr 2020 15:48:22 +0200 Subject: [PATCH 08/13] BUG: Fix replacing in `string` series with NA (#32621) --- pandas/core/internals/managers.py | 56 ++++++++++++++++--------------- 1 file changed, 29 insertions(+), 27 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 9838fdd3e7f71..a4c284ed005cc 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1941,6 +1941,21 @@ def 
_compare_or_regex_search(a, b, regex=False): ------- mask : array_like of bool """ + def _check(result, a, b): + if is_scalar(result) and (isinstance(a, np.ndarray) or isinstance(b, np.ndarray)): + type_names = [type(a).__name__, type(b).__name__] + + if is_a_array: + type_names[0] = f"ndarray(dtype={a.dtype})" + + if is_b_array: + type_names[1] = f"ndarray(dtype={b.dtype})" + + raise TypeError( + f"Cannot compare types {repr(type_names[0])} and {repr(type_names[1])}" + ) + return result + if not regex: op = lambda x: operator.eq(x, b) else: @@ -1951,42 +1966,29 @@ def _compare_or_regex_search(a, b, regex=False): is_a_array = isinstance(a, np.ndarray) is_b_array = isinstance(b, np.ndarray) + # GH#32621 use mask to avoid comparing to NAs + if is_a_array and not is_b_array: + mask = np.reshape(~(isna(a)), a.shape) + elif is_b_array and not is_a_array: + mask = np.reshape(~(isna(b)), b.shape) + elif is_a_array and is_b_array: + mask = ~(isna(a) | isna(b)) + if is_a_array: + a = a[mask] + if is_b_array: + b = b[mask] + if is_datetimelike_v_numeric(a, b) or is_numeric_v_string_like(a, b): # GH#29553 avoid deprecation warnings from numpy - result = False + return _check(False, a, b) else: - # GH#32621 use mask to avoid comparing to NAs - if is_a_array and not is_b_array: - mask = np.reshape(~(isna(a)), a.shape) - elif is_b_array and not is_a_array: - mask = np.reshape(~(isna(b)), b.shape) - elif is_a_array and is_b_array: - mask = ~(isna(a) | isna(b)) - - if is_a_array: - a = a[mask] - if is_b_array: - b = b[mask] result = op(a) - if isinstance(result, np.ndarray): tmp = np.zeros(mask.shape, dtype=np.bool) tmp[mask] = result result = tmp - if is_scalar(result) and (is_a_array or is_b_array): - type_names = [type(a).__name__, type(b).__name__] - - if is_a_array: - type_names[0] = f"ndarray(dtype={a.dtype})" - - if is_b_array: - type_names[1] = f"ndarray(dtype={b.dtype})" - - raise TypeError( - f"Cannot compare types {repr(type_names[0])} and {repr(type_names[1])}" - ) - 
return result + return _check(result, a, b) def _fast_count_smallints(arr): From 0a768445e655777040b9f54b5f7e3d6f483f863f Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Wed, 8 Apr 2020 16:02:20 +0200 Subject: [PATCH 09/13] BUG: Fix replacing in `string` series with NA (#32621) --- pandas/core/internals/managers.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index a4c284ed005cc..1845267a8d570 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1941,8 +1941,11 @@ def _compare_or_regex_search(a, b, regex=False): ------- mask : array_like of bool """ + def _check(result, a, b): - if is_scalar(result) and (isinstance(a, np.ndarray) or isinstance(b, np.ndarray)): + if is_scalar(result) and ( + isinstance(a, np.ndarray) or isinstance(b, np.ndarray) + ): type_names = [type(a).__name__, type(b).__name__] if is_a_array: From b62ad89500693ae80de7f36e8c49316c484efb7d Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Wed, 8 Apr 2020 21:37:44 +0200 Subject: [PATCH 10/13] BUG: Fix replacing in `string` series with NA (pandas-dev#32621) --- pandas/core/internals/managers.py | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 1845267a8d570..d8162d6474feb 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -8,7 +8,7 @@ import numpy as np from pandas._libs import Timedelta, Timestamp, internals as libinternals, lib -from pandas._typing import ArrayLike, DtypeObj, Label +from pandas._typing import ArrayLike, DtypeObj, Label, Scalar from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.cast import ( @@ -1942,7 +1942,15 @@ def _compare_or_regex_search(a, b, regex=False): mask : array_like of bool """ - def _check(result, a, b): + def _check_comparison_types( + result: 
Union[ArrayLike, Scalar], + a: Union[ArrayLike, Scalar], + b: Union[ArrayLike, Scalar], + ) -> Union[ArrayLike, Scalar]: + """ + Raises an error if the two arrays cannot be compared, + otherwise returns the comparison result as expected. + """ if is_scalar(result) and ( isinstance(a, np.ndarray) or isinstance(b, np.ndarray) ): @@ -1983,15 +1991,18 @@ def _check(result, a, b): if is_datetimelike_v_numeric(a, b) or is_numeric_v_string_like(a, b): # GH#29553 avoid deprecation warnings from numpy - return _check(False, a, b) - else: - result = op(a) - if isinstance(result, np.ndarray): - tmp = np.zeros(mask.shape, dtype=np.bool) - tmp[mask] = result - result = tmp + return _check_comparison_types(False, a, b) + + result = op(a) + + if isinstance(result, np.ndarray): + # The shape of the mask can differ to that of the result + # since we may compare only a subset of a's or b's elements + tmp = np.zeros(mask.shape, dtype=np.bool) + tmp[mask] = result + result = tmp - return _check(result, a, b) + return _check_comparison_types(result, a, b) def _fast_count_smallints(arr): From a73e2ebe7d96142fd48067bdd561c8d08847774d Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Wed, 8 Apr 2020 22:24:01 +0200 Subject: [PATCH 11/13] BUG: Fix replacing in `string` series with NA (pandas-dev#32621) --- pandas/core/internals/managers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 5a4d1d5afc83b..c9c6f543ece87 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1943,10 +1943,10 @@ def _check_comparison_types( ): type_names = [type(a).__name__, type(b).__name__] - if is_a_array: + if isinstance(a, np.ndarray): type_names[0] = f"ndarray(dtype={a.dtype})" - if is_b_array: + if isinstance(b, np.ndarray): type_names[1] = f"ndarray(dtype={b.dtype})" raise TypeError( From 949accc4efae3db5fb3a0358e897bf824118f07d Mon Sep 17 00:00:00 2001 From: chrispe92 Date: 
Thu, 9 Apr 2020 00:22:25 +0200 Subject: [PATCH 12/13] BUG: Fix replacing in `string` series with NA (pandas-dev#32621) --- pandas/core/internals/managers.py | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index c9c6f543ece87..46e6a51d0c701 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1911,7 +1911,9 @@ def _merge_blocks( return blocks -def _compare_or_regex_search(a, b, regex=False): +def _compare_or_regex_search( + a: Union[ArrayLike, Scalar], b: Union[ArrayLike, Scalar], regex: bool = False +) -> Union[ArrayLike, bool]: """ Compare two array_like inputs of the same shape or two scalar values @@ -1930,13 +1932,13 @@ def _compare_or_regex_search(a, b, regex=False): """ def _check_comparison_types( - result: Union[ArrayLike, Scalar], + result: Union[ArrayLike, bool], a: Union[ArrayLike, Scalar], b: Union[ArrayLike, Scalar], - ) -> Union[ArrayLike, Scalar]: + ) -> Union[ArrayLike, bool]: """ - Raises an error if the two arrays cannot be compared, - otherwise returns the comparison result as expected. + Raises an error if the two arrays (a,b) cannot be compared. + Otherwise, returns the comparison result as expected. 
""" if is_scalar(result) and ( isinstance(a, np.ndarray) or isinstance(b, np.ndarray) @@ -1958,22 +1960,21 @@ def _check_comparison_types( op = lambda x: operator.eq(x, b) else: op = np.vectorize( - lambda x: bool(re.search(b, x)) if isinstance(x, str) else False + lambda x: bool(re.search(b, x)) + if isinstance(x, str) and isinstance(b, str) + else False ) - is_a_array = isinstance(a, np.ndarray) - is_b_array = isinstance(b, np.ndarray) - # GH#32621 use mask to avoid comparing to NAs - if is_a_array and not is_b_array: + if isinstance(a, np.ndarray) and not isinstance(b, np.ndarray): mask = np.reshape(~(isna(a)), a.shape) - elif is_b_array and not is_a_array: + elif isinstance(b, np.ndarray) and not isinstance(a, np.ndarray): mask = np.reshape(~(isna(b)), b.shape) - elif is_a_array and is_b_array: + elif isinstance(a, np.ndarray) and isinstance(b, np.ndarray): mask = ~(isna(a) | isna(b)) - if is_a_array: + if isinstance(a, np.ndarray): a = a[mask] - if is_b_array: + if isinstance(b, np.ndarray): b = b[mask] if is_datetimelike_v_numeric(a, b) or is_numeric_v_string_like(a, b): From df5bc395a7e364190a84e79e73d7f02cc43a79d0 Mon Sep 17 00:00:00 2001 From: chrispe92 Date: Thu, 9 Apr 2020 20:19:35 +0200 Subject: [PATCH 13/13] Added description to the 1.1 bug fixes section --- doc/source/whatsnew/v1.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 5c39377899a20..f453261deec53 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -452,7 +452,7 @@ Missing ^^^^^^^ - Calling :meth:`fillna` on an empty Series now correctly returns a shallow copied object. The behaviour is now consistent with :class:`Index`, :class:`DataFrame` and a non-empty :class:`Series` (:issue:`32543`). - +- Bug in :meth:`replace` when argument ``to_replace`` is of type dict/list and is used on a :class:`Series` containing ```` was raising a ``TypeError``. 
The method now handles this by ignoring ``<NA>`` values when doing the comparison for the replacement (:issue:`32621`) MultiIndex ^^^^^^^^^^