From 6782445b10b515664753ca1809109e839d9c843d Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 19 Apr 2020 10:44:08 -0500 Subject: [PATCH 01/10] BUG: Fix StringArray use_inf_as_na bug --- doc/source/whatsnew/v1.1.0.rst | 4 ++-- pandas/_libs/missing.pyx | 3 ++- pandas/core/dtypes/missing.py | 6 +++++- pandas/tests/arrays/string_/test_string.py | 22 ++++++++++++++++++++++ 4 files changed, 31 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index a797090a83444..004eb0918a405 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -635,8 +635,8 @@ Sparse ExtensionArray ^^^^^^^^^^^^^^ -- Fixed bug where :meth:`Serires.value_counts` would raise on empty input of ``Int64`` dtype (:issue:`33317`) -- +- Fixed bug where :meth:`Series.value_counts` would raise on empty input of ``Int64`` dtype (:issue:`33317`) +- Fixed bug where :meth:`StringArray.isna` would return ``False`` for NA values when ``pandas.options.mode.use_inf_as_na`` was set to ``True`` (:issue:`33655`) Other diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index dacf454824190..a2a796d0ac98d 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -140,6 +140,7 @@ def isnaobj_old(arr: ndarray) -> ndarray: - INF - NEGINF - NaT + - NA Parameters ---------- @@ -160,7 +161,7 @@ def isnaobj_old(arr: ndarray) -> ndarray: result = np.zeros(n, dtype=np.uint8) for i in range(n): val = arr[i] - result[i] = val is NaT or _check_none_nan_inf_neginf(val) + result[i] = checknull(val) or val == INF or val == NEGINF return result.view(np.bool_) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 08a6d42042c1c..949842afac6a0 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -231,7 +231,11 @@ def _isna_ndarraylike(obj): def _isna_ndarraylike_old(obj): - values = getattr(obj, "_values", obj) + if not isinstance(obj, np.ndarray): + values = obj.to_numpy() + else: + values = obj + dtype = values.dtype if is_string_dtype(dtype): diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index fe770eed84b62..429c81fc97bd9 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -277,3 +277,25 @@ def test_value_counts_na(): result = arr.value_counts(dropna=True) expected = pd.Series([2, 1], index=["a", "b"], dtype="Int64") tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "values, expected", + [ + (pd.array(["a", "b", "c"]), np.array([False, False, False])), + (pd.array(["a", "b", None]), np.array([False, False, True])), + ], +) +def test_use_na_as_inf(values, expected): + # https://github.com/pandas-dev/pandas/issues/33655 + with pd.option_context("mode.use_inf_as_na", True): + result = values.isna() + tm.assert_numpy_array_equal(result, expected) + + result = pd.Series(values).isna() + expected = pd.Series(expected) + tm.assert_series_equal(result, expected) + + result = pd.DataFrame(values).isna() + expected = pd.DataFrame(expected) + tm.assert_frame_equal(result, expected) From 66a438de1942b81948b9a8f0f21459f6dae728b5 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Mon, 20 Apr 2020 07:26:08 -0500 Subject: [PATCH 02/10] Update --- pandas/_libs/missing.pyx | 5 ----- pandas/core/dtypes/missing.py | 9 +++------ 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index a2a796d0ac98d..42184c104462c 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -89,11 +89,6 @@ cpdef bint checknull_old(object val): return False -cdef inline bint _check_none_nan_inf_neginf(object val): - return val is None or (isinstance(val, float) and - (val != val or val == INF or val == NEGINF)) - - @cython.wraparound(False) @cython.boundscheck(False) cpdef ndarray[uint8_t] isnaobj(ndarray arr): diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 949842afac6a0..5ff5af950f2e9 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -231,16 +231,13 @@ def _isna_ndarraylike(obj): def _isna_ndarraylike_old(obj): - if not isinstance(obj, np.ndarray): - values = obj.to_numpy() - else: - values = obj - + values = getattr(obj, "_values", obj) dtype = values.dtype + if is_extension_array_dtype(dtype): + result = values.isna() | (values == -np.inf) | (values == np.inf) if is_string_dtype(dtype): result = _isna_string_dtype(values, dtype, old=True) - elif needs_i8_conversion(dtype): # this is the NaT pattern result = values.view("i8") == iNaT From 5f7db1330bdc82cffa15e3550e0efe0c013c1dea Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Mon, 20 Apr 2020 07:28:35 -0500 Subject: [PATCH 03/10] Fix --- pandas/core/dtypes/missing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index b736fdef9f438..04d12d999afab 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -235,7 +235,7 @@ def _isna_ndarraylike_old(obj): if is_extension_array_dtype(dtype): result = values.isna() | (values == -np.inf) | (values == np.inf) - if is_string_dtype(dtype): + elif is_string_dtype(dtype): result = _isna_string_dtype(values, dtype, old=True) elif needs_i8_conversion(dtype): # this is the NaT pattern From df626f5a6169fe6d8f845591860791f3b2f782af Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Mon, 27 Apr 2020 21:31:00 -0500 Subject: [PATCH 04/10] Cast to numpy --- pandas/core/dtypes/missing.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 92e1b17c41694..e9f0e23ea966d 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -223,15 +223,13 @@ def _isna_ndarraylike(obj, old: bool = False): array-like Array of boolean values denoting the NA status of each element. """ - values = getattr(obj, "_values", obj) + if not isinstance(obj, np.ndarray): + values = obj.to_numpy() + else: + values = obj dtype = values.dtype - if is_extension_array_dtype(dtype): - if old: - result = values.isna() | (values == -np.inf) | (values == np.inf) - else: - result = values.isna() - elif is_string_dtype(dtype): + if is_string_dtype(dtype): result = _isna_string_dtype(values, dtype, old=old) elif needs_i8_conversion(dtype): # this is the NaT pattern From 6d4afbc9c6d04a6b7b8fe84715baa21b7f1f862a Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Mon, 27 Apr 2020 21:56:04 -0500 Subject: [PATCH 05/10] Revert "Cast to numpy" This reverts commit df626f5a6169fe6d8f845591860791f3b2f782af. --- pandas/core/dtypes/missing.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index e9f0e23ea966d..92e1b17c41694 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -223,13 +223,15 @@ def _isna_ndarraylike(obj, old: bool = False): array-like Array of boolean values denoting the NA status of each element. """ - if not isinstance(obj, np.ndarray): - values = obj.to_numpy() - else: - values = obj + values = getattr(obj, "_values", obj) dtype = values.dtype - if is_string_dtype(dtype): + if is_extension_array_dtype(dtype): + if old: + result = values.isna() | (values == -np.inf) | (values == np.inf) + else: + result = values.isna() + elif is_string_dtype(dtype): result = _isna_string_dtype(values, dtype, old=old) elif needs_i8_conversion(dtype): # this is the NaT pattern From ad058d618ab572e706fab5ec1f48c3f3d9bf99e6 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Tue, 28 Apr 2020 17:35:16 -0500 Subject: [PATCH 06/10] Try libmissing --- pandas/core/dtypes/missing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 92e1b17c41694..6f7b6e3bdf732 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -228,7 +228,7 @@ def _isna_ndarraylike(obj, old: bool = False): if is_extension_array_dtype(dtype): if old: - result = values.isna() | (values == -np.inf) | (values == np.inf) + result = libmissing.isnaobj_old(values.to_numpy()) else: result = values.isna() elif is_string_dtype(dtype): From e80d10039b97c1b1ddb17d37c0bd7a04fc0c0250 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Tue, 28 Apr 2020 18:35:48 -0500 Subject: [PATCH 07/10] Update test --- pandas/tests/series/test_missing.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index a64a6bc584cf6..0b0684bb61f3b 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -509,12 +509,12 @@ def test_fillna_nat(self): tm.assert_frame_equal(filled2, expected) def test_isna_for_inf(self): - s = Series(["a", np.inf, np.nan, 1.0]) + s = Series(["a", np.inf, np.nan, pd.NA, 1.0]) with pd.option_context("mode.use_inf_as_na", True): r = s.isna() dr = s.dropna() - e = Series([False, True, True, False]) - de = Series(["a", 1.0], index=[0, 3]) + e = Series([False, True, True, True, False]) + de = Series(["a", 1.0], index=[0, 4]) tm.assert_series_equal(r, e) tm.assert_series_equal(dr, de) From d5bf7fc27b5b89390fc14de52f76fed02fca8309 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Tue, 28 Apr 2020 21:15:16 -0500 Subject: [PATCH 08/10] Fix --- pandas/tests/arrays/string_/test_string.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 46182e3168f99..6d5cc09793db9 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -312,7 +312,7 @@ def test_value_counts_na(): (pd.array(["a", "b", None]), np.array([False, False, True])), ], ) -def test_use_na_as_inf(values, expected): +def test_use_inf_as_na(values, expected): # https://github.com/pandas-dev/pandas/issues/33655 with pd.option_context("mode.use_inf_as_na", True): result = values.isna() From 1b813b01843abe40c6f7ee752bc55cf0bd361294 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Wed, 29 Apr 2020 12:01:50 -0500 Subject: [PATCH 09/10] Add base test --- pandas/tests/extension/base/missing.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/tests/extension/base/missing.py b/pandas/tests/extension/base/missing.py index 2393d2edcd2c6..a5969ef961bab 100644 --- a/pandas/tests/extension/base/missing.py +++ b/pandas/tests/extension/base/missing.py @@ -127,3 +127,10 @@ def test_fillna_fill_other(self, data): expected = pd.DataFrame({"A": data, "B": [0.0] * len(result)}) self.assert_frame_equal(result, expected) + + def test_use_inf_as_na_no_effect(self, data_missing): + ser = pd.Series(data_missing) + expected = ser.isna() + with pd.option_context("mode.use_inf_as_na", True): + result = ser.isna() + self.assert_series_equal(result, expected) From 4b3b894ada902b47dc6d1b690b5f9b4f6620033f Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 10 May 2020 09:57:42 -0500 Subject: [PATCH 10/10] Special case categorical --- pandas/core/dtypes/missing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index a5b94585e4ea6..ab8df492f1c01 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -17,6 +17,7 @@ TD64NS_DTYPE, ensure_object, is_bool_dtype, + is_categorical_dtype, is_complex_dtype, is_datetimelike_v_numeric, is_dtype_equal, @@ -209,7 +210,7 @@ def _isna_ndarraylike(obj, inf_as_na: bool = False): dtype = values.dtype if is_extension_array_dtype(dtype): - if inf_as_na: + if inf_as_na and is_categorical_dtype(dtype): result = libmissing.isnaobj_old(values.to_numpy()) else: result = values.isna()