diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index d6f9bb66e1e28..89404fc4305f8 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1272,7 +1272,7 @@ Strings - Bug in :meth:`Index.str.partition` was not nan-safe (:issue:`23558`). - Bug in :meth:`Index.str.split` was not nan-safe (:issue:`23677`). -- +- Bug :func:`Series.str.contains` not respecting the ``na`` argument for a ``Categorical`` dtype ``Series`` (:issue:`22158`) Interval ^^^^^^^^ diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 58ce562d03d1d..0d89a97b69046 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1857,7 +1857,7 @@ def __iter__(self): g = self.get(i) def _wrap_result(self, result, use_codes=True, - name=None, expand=None): + name=None, expand=None, fill_value=np.nan): from pandas.core.index import Index, MultiIndex @@ -1867,7 +1867,8 @@ def _wrap_result(self, result, use_codes=True, # so make it possible to skip this step as the method already did this # before the transformation... if use_codes and self._is_categorical: - result = take_1d(result, self._orig.cat.codes) + result = take_1d(result, self._orig.cat.codes, + fill_value=fill_value) if not hasattr(result, 'ndim') or not hasattr(result, 'dtype'): return result @@ -2515,12 +2516,12 @@ def join(self, sep): def contains(self, pat, case=True, flags=0, na=np.nan, regex=True): result = str_contains(self._parent, pat, case=case, flags=flags, na=na, regex=regex) - return self._wrap_result(result) + return self._wrap_result(result, fill_value=na) @copy(str_match) def match(self, pat, case=True, flags=0, na=np.nan): result = str_match(self._parent, pat, case=case, flags=flags, na=na) - return self._wrap_result(result) + return self._wrap_result(result, fill_value=na) @copy(str_replace) def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 42f0cebea83a0..2ff63b67d1202 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -512,10 +512,28 @@ def test_contains(self): assert result.dtype == np.bool_ tm.assert_numpy_array_equal(result, expected) - # na - values = Series(['om', 'foo', np.nan]) - res = values.str.contains('foo', na="foo") - assert res.loc[2] == "foo" + def test_contains_for_object_category(self): + # gh 22158 + + # na for category + values = Series(["a", "b", "c", "a", np.nan], dtype="category") + result = values.str.contains('a', na=True) + expected = Series([True, False, False, True, True]) + tm.assert_series_equal(result, expected) + + result = values.str.contains('a', na=False) + expected = Series([True, False, False, True, False]) + tm.assert_series_equal(result, expected) + + # na for objects + values = Series(["a", "b", "c", "a", np.nan]) + result = values.str.contains('a', na=True) + expected = Series([True, False, False, True, True]) + tm.assert_series_equal(result, expected) + + result = values.str.contains('a', na=False) + expected = Series([True, False, False, True, False]) + tm.assert_series_equal(result, expected) def test_startswith(self): values = Series(['om', NA, 'foo_nom', 'nom', 'bar_foo', NA, 'foo']) @@ -2875,7 +2893,7 @@ def test_get_complex_nested(self, to_type): expected = Series([np.nan]) tm.assert_series_equal(result, expected) - def test_more_contains(self): + def test_contains_moar(self): # PR #1179 s = Series(['A', 'B', 'C', 'Aaba', 'Baca', '', NA, 'CABA', 'dog', 'cat']) @@ -2925,7 +2943,7 @@ def test_contains_nan(self): expected = Series([np.nan, np.nan, np.nan], dtype=np.object_) assert_series_equal(result, expected) - def test_more_replace(self): + def test_replace_moar(self): # PR #1179 s = Series(['A', 'B', 'C', 'Aaba', 'Baca', '', NA, 'CABA', 'dog', 'cat'])