Backport PR pandas-dev#56123 on branch 2.1.x (BUG: ne comparison returns False for NA and other value) (pandas-dev#56382)

phofl · web-flow · commit aacdf613a897 · 2023-12-07T22:19:15.000+01:00
diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst
@@ -30,6 +30,7 @@ Bug fixes
 - Fixed bug in :meth:`DataFrame.__setitem__` casting :class:`Index` with object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`)
 - Fixed bug in :meth:`DataFrame.to_hdf` raising when columns have ``StringDtype`` (:issue:`55088`)
 - Fixed bug in :meth:`Index.insert` casting object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`)
+- Fixed bug in :meth:`Series.__ne__` resulting in False for comparison between ``NA`` and string value for ``dtype="string[pyarrow_numpy]"`` (:issue:`56122`)
 - Fixed bug in :meth:`Series.str.split` and :meth:`Series.str.rsplit` when ``pat=None`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56271`)
 - Fixed bug in :meth:`Series.str.translate` losing object dtype when string option is set (:issue:`56152`)
 -
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 from functools import partial
+import operator
 import re
 from typing import (
     TYPE_CHECKING,
@@ -600,7 +601,10 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None):
 
     def _cmp_method(self, other, op):
         result = super()._cmp_method(other, op)
-        return result.to_numpy(np.bool_, na_value=False)
+        if op == operator.ne:
+            return result.to_numpy(np.bool_, na_value=True)
+        else:
+            return result.to_numpy(np.bool_, na_value=False)
 
     def value_counts(self, dropna: bool = True):
         from pandas import Series
diff --git a/pandas/tests/arithmetic/test_object.py b/pandas/tests/arithmetic/test_object.py
@@ -8,10 +8,13 @@
 import numpy as np
 import pytest
 
+import pandas.util._test_decorators as td
+
 import pandas as pd
 from pandas import (
     Series,
     Timestamp,
+    option_context,
 )
 import pandas._testing as tm
 from pandas.core import ops
@@ -31,20 +34,24 @@ def test_comparison_object_numeric_nas(self, comparison_op):
         expected = func(ser.astype(float), shifted.astype(float))
         tm.assert_series_equal(result, expected)
 
-    def test_object_comparisons(self):
-        ser = Series(["a", "b", np.nan, "c", "a"])
+    @pytest.mark.parametrize(
+        "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
+    )
+    def test_object_comparisons(self, infer_string):
+        with option_context("future.infer_string", infer_string):
+            ser = Series(["a", "b", np.nan, "c", "a"])
 
-        result = ser == "a"
-        expected = Series([True, False, False, False, True])
-        tm.assert_series_equal(result, expected)
+            result = ser == "a"
+            expected = Series([True, False, False, False, True])
+            tm.assert_series_equal(result, expected)
 
-        result = ser < "a"
-        expected = Series([False, False, False, False, False])
-        tm.assert_series_equal(result, expected)
+            result = ser < "a"
+            expected = Series([False, False, False, False, False])
+            tm.assert_series_equal(result, expected)
 
-        result = ser != "a"
-        expected = -(ser == "a")
-        tm.assert_series_equal(result, expected)
+            result = ser != "a"
+            expected = -(ser == "a")
+            tm.assert_series_equal(result, expected)
 
     @pytest.mark.parametrize("dtype", [None, object])
     def test_more_na_comparisons(self, dtype):
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
@@ -2,6 +2,8 @@
 This module tests the functionality of StringArray and ArrowStringArray.
 Tests for the str accessors are in pandas/tests/strings/test_string_array.py
 """
+import operator
+
 import numpy as np
 import pytest
 
@@ -221,7 +223,10 @@ def test_comparison_methods_scalar(comparison_op, dtype):
     result = getattr(a, op_name)(other)
     if dtype.storage == "pyarrow_numpy":
         expected = np.array([getattr(item, op_name)(other) for item in a])
-        expected[1] = False
+        if comparison_op == operator.ne:
+            expected[1] = True
+        else:
+            expected[1] = False
         tm.assert_numpy_array_equal(result, expected.astype(np.bool_))
     else:
         expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean"
@@ -236,7 +241,10 @@ def test_comparison_methods_scalar_pd_na(comparison_op, dtype):
     result = getattr(a, op_name)(pd.NA)
 
     if dtype.storage == "pyarrow_numpy":
-        expected = np.array([False, False, False])
+        if operator.ne == comparison_op:
+            expected = np.array([True, True, True])
+        else:
+            expected = np.array([False, False, False])
         tm.assert_numpy_array_equal(result, expected)
     else:
         expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean"
@@ -262,7 +270,7 @@ def test_comparison_methods_scalar_not_string(comparison_op, dtype):
     if dtype.storage == "pyarrow_numpy":
         expected_data = {
             "__eq__": [False, False, False],
-            "__ne__": [True, False, True],
+            "__ne__": [True, True, True],
         }[op_name]
         expected = np.array(expected_data)
         tm.assert_numpy_array_equal(result, expected)
@@ -282,12 +290,18 @@ def test_comparison_methods_array(comparison_op, dtype):
     other = [None, None, "c"]
     result = getattr(a, op_name)(other)
     if dtype.storage == "pyarrow_numpy":
-        expected = np.array([False, False, False])
-        expected[-1] = getattr(other[-1], op_name)(a[-1])
+        if operator.ne == comparison_op:
+            expected = np.array([True, True, False])
+        else:
+            expected = np.array([False, False, False])
+            expected[-1] = getattr(other[-1], op_name)(a[-1])
         tm.assert_numpy_array_equal(result, expected)
 
         result = getattr(a, op_name)(pd.NA)
-        expected = np.array([False, False, False])
+        if operator.ne == comparison_op:
+            expected = np.array([True, True, True])
+        else:
+            expected = np.array([False, False, False])
         tm.assert_numpy_array_equal(result, expected)
 
     else:

Original file line number	Diff line number	Diff line change
`@@ -30,6 +30,7 @@ Bug fixes`
`30`	`30`	- Fixed bug in :meth:`DataFrame.__setitem__` casting :class:`Index` with object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`)
`31`	`31`	- Fixed bug in :meth:`DataFrame.to_hdf` raising when columns have ``StringDtype`` (:issue:`55088`)
`32`	`32`	- Fixed bug in :meth:`Index.insert` casting object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`)
	`33`	+- Fixed bug in :meth:`Series.__ne__` resulting in False for comparison between ``NA`` and string value for ``dtype="string[pyarrow_numpy]"`` (:issue:`56122`)
`33`	`34`	- Fixed bug in :meth:`Series.str.split` and :meth:`Series.str.rsplit` when ``pat=None`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56271`)
`34`	`35`	- Fixed bug in :meth:`Series.str.translate` losing object dtype when string option is set (:issue:`56152`)
`35`	`36`	`-`