From 077d136fe72682d243cb5d0427d368b4543631fc Mon Sep 17 00:00:00 2001 From: Pulkit Maloo Date: Thu, 2 Aug 2018 09:48:15 -0400 Subject: [PATCH 01/29] BUG: fixed .str.contains(..., na=False) for categorical series --- pandas/core/strings.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index b98fa106336fc..6cfa1ef96d835 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1726,6 +1726,7 @@ def _wrap_result(self, result, use_codes=True, # before the transformation... if use_codes and self._is_categorical: result = take_1d(result, self._orig.cat.codes) + result[isna(result)] = False if not hasattr(result, 'ndim') or not hasattr(result, 'dtype'): return result From 2ae44d1b01b677117e0aa97ca19d9f4575263e0d Mon Sep 17 00:00:00 2001 From: Pulkit Maloo Date: Thu, 2 Aug 2018 11:34:39 -0400 Subject: [PATCH 02/29] na argument for _wrap_results --- pandas/core/strings.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 6cfa1ef96d835..3ce6131c41bd9 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1715,7 +1715,7 @@ def __iter__(self): g = self.get(i) def _wrap_result(self, result, use_codes=True, - name=None, expand=None): + name=None, expand=None, na=np.nan): from pandas.core.index import Index, MultiIndex @@ -1726,7 +1726,7 @@ def _wrap_result(self, result, use_codes=True, # before the transformation... if use_codes and self._is_categorical: result = take_1d(result, self._orig.cat.codes) - result[isna(result)] = False + result[isna(result)] = na if not hasattr(result, 'ndim') or not hasattr(result, 'dtype'): return result @@ -1890,7 +1890,7 @@ def join(self, sep): def contains(self, pat, case=True, flags=0, na=np.nan, regex=True): result = str_contains(self._data, pat, case=case, flags=flags, na=na, regex=regex) - return self._wrap_result(result) + return self._wrap_result(result, na=na) @copy(str_match) def match(self, pat, case=True, flags=0, na=np.nan, as_indexer=None): From a1b3d7bd0691aa07a73c1150ee3a382435551f64 Mon Sep 17 00:00:00 2001 From: Pulkit Maloo Date: Fri, 17 Aug 2018 14:47:31 -0400 Subject: [PATCH 03/29] fixed str.contains for missing values --- pandas/core/strings.py | 7 ++++++- pandas/tests/test_strings.py | 7 +++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 3ce6131c41bd9..5ea440e6479f9 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1726,7 +1726,12 @@ def _wrap_result(self, result, use_codes=True, # before the transformation... if use_codes and self._is_categorical: result = take_1d(result, self._orig.cat.codes) - result[isna(result)] = na + missing = isna(result) + + if missing.any(): + result_type = np.result_type(result, na) + result = result.astype(result_type, copy=False) + result[isna(result)] = na if not hasattr(result, 'ndim') or not hasattr(result, 'dtype'): return result diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index aa94b992facfc..165705b9478ba 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -228,6 +228,13 @@ def test_contains(self): res = values.str.contains('foo', na="foo") assert res.loc[2] == "foo" + # category + values = Series(["a","b","c","a", np.nan], dtype="category") + result = values.str.contains('a', na=True) + expected = Series([True, False, False, True, True], dtype=np.object_) + assert isinstance(result, Series) + tm.assert_series_equal(result, expected) + def test_startswith(self): values = Series(['om', NA, 'foo_nom', 'nom', 'bar_foo', NA, 'foo']) From dbd990bfa953cb60524c28443c9bfaaeeed8ebb3 Mon Sep 17 00:00:00 2001 From: Pulkit Maloo Date: Fri, 17 Aug 2018 15:00:35 -0400 Subject: [PATCH 04/29] PEP8 Issue: added whitespace after ',' --- pandas/tests/test_strings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 165705b9478ba..48edb23cec3b6 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -229,7 +229,7 @@ def test_contains(self): assert res.loc[2] == "foo" # category - values = Series(["a","b","c","a", np.nan], dtype="category") + values = Series(["a", "b", "c", "a", np.nan], dtype="category") result = values.str.contains('a', na=True) expected = Series([True, False, False, True, True], dtype=np.object_) assert isinstance(result, Series) From 90aef7bc23b20bd83486c84b617da32ba82f13ac Mon Sep 17 00:00:00 2001 From: Pulkit Maloo Date: Fri, 24 Aug 2018 14:32:40 -0400 Subject: [PATCH 05/29] na argument for wrap_results in match --- pandas/core/strings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 06fd1391e1345..9e9f0a378759d 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -2478,7 +2478,7 @@ def contains(self, pat, case=True, flags=0, na=np.nan, regex=True): def match(self, pat, case=True, flags=0, na=np.nan, as_indexer=None): result = str_match(self._parent, pat, case=case, flags=flags, na=na, as_indexer=as_indexer) - return self._wrap_result(result) + return self._wrap_result(result, na=na) @copy(str_replace) def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): From 69f16af364f04f37a6573576e6bc1cd77c3228a9 Mon Sep 17 00:00:00 2001 From: Pulkit Maloo Date: Thu, 2 Aug 2018 09:48:15 -0400 Subject: [PATCH 06/29] BUG: fixed .str.contains(..., na=False) for categorical series --- pandas/core/strings.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 07e744a6284ef..dc08b453b2313 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1855,6 +1855,7 @@ def _wrap_result(self, result, use_codes=True, # before the transformation... if use_codes and self._is_categorical: result = take_1d(result, self._orig.cat.codes) + result[isna(result)] = False if not hasattr(result, 'ndim') or not hasattr(result, 'dtype'): return result From 78cf8c7a8943e1702d0834a7a4e75a8f24e9b5cc Mon Sep 17 00:00:00 2001 From: Pulkit Maloo Date: Thu, 2 Aug 2018 11:34:39 -0400 Subject: [PATCH 07/29] na argument for _wrap_results --- pandas/core/strings.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index dc08b453b2313..8097ed1961163 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1844,7 +1844,7 @@ def __iter__(self): g = self.get(i) def _wrap_result(self, result, use_codes=True, - name=None, expand=None): + name=None, expand=None, na=np.nan): from pandas.core.index import Index, MultiIndex @@ -1855,7 +1855,7 @@ def _wrap_result(self, result, use_codes=True, # before the transformation... if use_codes and self._is_categorical: result = take_1d(result, self._orig.cat.codes) - result[isna(result)] = False + result[isna(result)] = na if not hasattr(result, 'ndim') or not hasattr(result, 'dtype'): return result @@ -2467,7 +2467,7 @@ def join(self, sep): def contains(self, pat, case=True, flags=0, na=np.nan, regex=True): result = str_contains(self._parent, pat, case=case, flags=flags, na=na, regex=regex) - return self._wrap_result(result) + return self._wrap_result(result, na=na) @copy(str_match) def match(self, pat, case=True, flags=0, na=np.nan, as_indexer=None): From 93bb24a51c42f8c2326fa51b7a193352b4b1b79e Mon Sep 17 00:00:00 2001 From: Pulkit Maloo Date: Fri, 17 Aug 2018 14:47:31 -0400 Subject: [PATCH 08/29] fixed str.contains for missing values --- pandas/core/strings.py | 7 ++++++- pandas/tests/test_strings.py | 7 +++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 8097ed1961163..06fd1391e1345 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1855,7 +1855,12 @@ def _wrap_result(self, result, use_codes=True, # before the transformation... if use_codes and self._is_categorical: result = take_1d(result, self._orig.cat.codes) - result[isna(result)] = na + missing = isna(result) + + if missing.any(): + result_type = np.result_type(result, na) + result = result.astype(result_type, copy=False) + result[isna(result)] = na if not hasattr(result, 'ndim') or not hasattr(result, 'dtype'): return result diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 9d008dfd25c90..95bbfc59c5978 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -572,6 +572,13 @@ def test_contains(self): res = values.str.contains('foo', na="foo") assert res.loc[2] == "foo" + # category + values = Series(["a","b","c","a", np.nan], dtype="category") + result = values.str.contains('a', na=True) + expected = Series([True, False, False, True, True], dtype=np.object_) + assert isinstance(result, Series) + tm.assert_series_equal(result, expected) + def test_startswith(self): values = Series(['om', NA, 'foo_nom', 'nom', 'bar_foo', NA, 'foo']) From 6c2700f0d2c9fbbbf6741e62cdaca01e737afd53 Mon Sep 17 00:00:00 2001 From: Pulkit Maloo Date: Fri, 17 Aug 2018 15:00:35 -0400 Subject: [PATCH 09/29] PEP8 Issue: added whitespace after ',' --- pandas/tests/test_strings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 95bbfc59c5978..7d05f89db4ed5 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -573,7 +573,7 @@ def test_contains(self): assert res.loc[2] == "foo" # category - values = Series(["a","b","c","a", np.nan], dtype="category") + values = Series(["a", "b", "c", "a", np.nan], dtype="category") result = values.str.contains('a', na=True) expected = Series([True, False, False, True, True], dtype=np.object_) assert isinstance(result, Series) From 66491291f575bec91ea5e76f805c21b928746959 Mon Sep 17 00:00:00 2001 From: Pulkit Maloo Date: Fri, 24 Aug 2018 14:32:40 -0400 Subject: [PATCH 10/29] na argument for wrap_results in match --- pandas/core/strings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 06fd1391e1345..9e9f0a378759d 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -2478,7 +2478,7 @@ def contains(self, pat, case=True, flags=0, na=np.nan, regex=True): def match(self, pat, case=True, flags=0, na=np.nan, as_indexer=None): result = str_match(self._parent, pat, case=case, flags=flags, na=na, as_indexer=as_indexer) - return self._wrap_result(result) + return self._wrap_result(result, na=na) @copy(str_replace) def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): From 3abdea5bb35f7d662e2c5d28bb2f6f19c2beccbe Mon Sep 17 00:00:00 2001 From: Pulkit Maloo Date: Sat, 22 Sep 2018 15:27:08 -0400 Subject: [PATCH 11/29] Update circle-27-compat.yaml --- ci/circle-27-compat.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/circle-27-compat.yaml b/ci/circle-27-compat.yaml index 5e9842f4742c5..b5be569eb28a4 100644 --- a/ci/circle-27-compat.yaml +++ b/ci/circle-27-compat.yaml @@ -7,7 +7,7 @@ dependencies: - cython=0.28.2 - jinja2=2.8 - numexpr=2.4.4 # we test that we correctly don't use an unsupported numexpr - - numpy=1.9.3 + - numpy=1.9.2 - openpyxl - psycopg2 - pytables=3.2.2 From d1365993d1816462475dcd384c6901c6d4515adc Mon Sep 17 00:00:00 2001 From: Pulkit Maloo Date: Sat, 22 Sep 2018 15:59:41 -0400 Subject: [PATCH 12/29] Update travis-27-locale.yaml --- ci/travis-27-locale.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/travis-27-locale.yaml b/ci/travis-27-locale.yaml index 73ab424329463..78cbe8f59a8e0 100644 --- a/ci/travis-27-locale.yaml +++ b/ci/travis-27-locale.yaml @@ -7,7 +7,7 @@ dependencies: - cython=0.28.2 - lxml - matplotlib=1.4.3 - - numpy=1.9.3 + - numpy=1.9.2 - openpyxl=2.4.0 - python-dateutil - python-blosc From 82f9b9e83b8f6ab0178c2cc1f9cc9c1a29815ff0 Mon Sep 17 00:00:00 2001 From: Pulkit Maloo Date: Sat, 22 Sep 2018 16:37:37 -0400 Subject: [PATCH 13/29] added tests for na arg for categorical and objects --- pandas/tests/test_strings.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 7d05f89db4ed5..2a0e6231a5c34 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -572,12 +572,24 @@ def test_contains(self): res = values.str.contains('foo', na="foo") assert res.loc[2] == "foo" - # category + # na for category values = Series(["a", "b", "c", "a", np.nan], dtype="category") result = values.str.contains('a', na=True) expected = Series([True, False, False, True, True], dtype=np.object_) assert isinstance(result, Series) tm.assert_series_equal(result, expected) + result = values.str.contains('a', na=False) + expected = Series([True, False, False, True, False], dtype=np.object_) + tm.assert_series_equal(result, expected) + + # na for objects + values = Series(["a", "b", "c", "a", np.nan]) + result = values.str.contains('a', na=True) + expected = Series([True, False, False, True, True], dtype=np.object_) + tm.assert_series_equal(result, expected) + result = values.str.contains('a', na=False) + expected = Series([True, False, False, True, False], dtype=np.object_) + tm.assert_series_equal(result, expected) def test_startswith(self): values = Series(['om', NA, 'foo_nom', 'nom', 'bar_foo', NA, 'foo']) From 53e925350829297627ea5d8d101111d3acad1439 Mon Sep 17 00:00:00 2001 From: Pulkit Maloo Date: Sun, 23 Sep 2018 07:20:57 -0400 Subject: [PATCH 14/29] updated _wrap_results with arg fill_value and removed na --- pandas/core/strings.py | 14 ++++---------- pandas/tests/test_strings.py | 15 ++++++--------- 2 files changed, 10 insertions(+), 19 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 9e9f0a378759d..42e783de62354 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1844,7 +1844,7 @@ def __iter__(self): g = self.get(i) def _wrap_result(self, result, use_codes=True, - name=None, expand=None, na=np.nan): + name=None, expand=None, fill_value=np.nan): from pandas.core.index import Index, MultiIndex @@ -1854,13 +1854,7 @@ def _wrap_result(self, result, use_codes=True, # so make it possible to skip this step as the method already did this # before the transformation... if use_codes and self._is_categorical: - result = take_1d(result, self._orig.cat.codes) - missing = isna(result) - - if missing.any(): - result_type = np.result_type(result, na) - result = result.astype(result_type, copy=False) - result[isna(result)] = na + result = take_1d(result, self._orig.cat.codes, fill_value=fill_value) if not hasattr(result, 'ndim') or not hasattr(result, 'dtype'): return result @@ -2472,13 +2466,13 @@ def join(self, sep): def contains(self, pat, case=True, flags=0, na=np.nan, regex=True): result = str_contains(self._parent, pat, case=case, flags=flags, na=na, regex=regex) - return self._wrap_result(result, na=na) + return self._wrap_result(result, fill_value=na) @copy(str_match) def match(self, pat, case=True, flags=0, na=np.nan, as_indexer=None): result = str_match(self._parent, pat, case=case, flags=flags, na=na, as_indexer=as_indexer) - return self._wrap_result(result, na=na) + return self._wrap_result(result, fill_value=na) @copy(str_replace) def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 2a0e6231a5c34..479ea7f9e1a47 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -567,27 +567,24 @@ def test_contains(self): assert result.dtype == np.bool_ tm.assert_numpy_array_equal(result, expected) - # na - values = Series(['om', 'foo', np.nan]) - res = values.str.contains('foo', na="foo") - assert res.loc[2] == "foo" - # na for category values = Series(["a", "b", "c", "a", np.nan], dtype="category") - result = values.str.contains('a', na=True) + result = values.str.contains('a', na=True).astype(object) expected = Series([True, False, False, True, True], dtype=np.object_) assert isinstance(result, Series) tm.assert_series_equal(result, expected) - result = values.str.contains('a', na=False) + + result = values.str.contains('a', na=False).astype(object) expected = Series([True, False, False, True, False], dtype=np.object_) tm.assert_series_equal(result, expected) # na for objects values = Series(["a", "b", "c", "a", np.nan]) - result = values.str.contains('a', na=True) + result = values.str.contains('a', na=True).astype(object) expected = Series([True, False, False, True, True], dtype=np.object_) tm.assert_series_equal(result, expected) - result = values.str.contains('a', na=False) + + result = values.str.contains('a', na=False).astype(object) expected = Series([True, False, False, True, False], dtype=np.object_) tm.assert_series_equal(result, expected) From ffa9969606a987280db7839d5a194f0a62051ab3 Mon Sep 17 00:00:00 2001 From: Pulkit Maloo Date: Thu, 2 Aug 2018 09:48:15 -0400 Subject: [PATCH 15/29] BUG: fixed .str.contains(..., na=False) for categorical series --- pandas/core/strings.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index ed091ce4956bc..2a47b914fb901 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1873,6 +1873,7 @@ def _wrap_result(self, result, use_codes=True, # before the transformation... if use_codes and self._is_categorical: result = take_1d(result, self._orig.cat.codes) + result[isna(result)] = False if not hasattr(result, 'ndim') or not hasattr(result, 'dtype'): return result From 07c1d73d249cd403d265654933745e2f9c358a2e Mon Sep 17 00:00:00 2001 From: Pulkit Maloo Date: Thu, 2 Aug 2018 11:34:39 -0400 Subject: [PATCH 16/29] na argument for _wrap_results --- pandas/core/strings.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 2a47b914fb901..63136b1b4645d 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1862,7 +1862,7 @@ def __iter__(self): g = self.get(i) def _wrap_result(self, result, use_codes=True, - name=None, expand=None): + name=None, expand=None, na=np.nan): from pandas.core.index import Index, MultiIndex @@ -1873,7 +1873,7 @@ def _wrap_result(self, result, use_codes=True, # before the transformation... if use_codes and self._is_categorical: result = take_1d(result, self._orig.cat.codes) - result[isna(result)] = False + result[isna(result)] = na if not hasattr(result, 'ndim') or not hasattr(result, 'dtype'): return result @@ -2500,7 +2500,7 @@ def join(self, sep): def contains(self, pat, case=True, flags=0, na=np.nan, regex=True): result = str_contains(self._parent, pat, case=case, flags=flags, na=na, regex=regex) - return self._wrap_result(result) + return self._wrap_result(result, na=na) @copy(str_match) def match(self, pat, case=True, flags=0, na=np.nan): From 7f1f2e2195c4f878d8742a9c084f815767ba0cf7 Mon Sep 17 00:00:00 2001 From: Pulkit Maloo Date: Fri, 17 Aug 2018 14:47:31 -0400 Subject: [PATCH 17/29] fixed str.contains for missing values --- pandas/core/strings.py | 7 ++++++- pandas/tests/test_strings.py | 7 +++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 63136b1b4645d..2c8a7f16230e1 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1873,7 +1873,12 @@ def _wrap_result(self, result, use_codes=True, # before the transformation... if use_codes and self._is_categorical: result = take_1d(result, self._orig.cat.codes) - result[isna(result)] = na + missing = isna(result) + + if missing.any(): + result_type = np.result_type(result, na) + result = result.astype(result_type, copy=False) + result[isna(result)] = na if not hasattr(result, 'ndim') or not hasattr(result, 'dtype'): return result diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index bd450cdcf8054..4649d5e70f955 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -548,6 +548,13 @@ def test_contains(self): res = values.str.contains('foo', na="foo") assert res.loc[2] == "foo" + # category + values = Series(["a","b","c","a", np.nan], dtype="category") + result = values.str.contains('a', na=True) + expected = Series([True, False, False, True, True], dtype=np.object_) + assert isinstance(result, Series) + tm.assert_series_equal(result, expected) + def test_startswith(self): values = Series(['om', NA, 'foo_nom', 'nom', 'bar_foo', NA, 'foo']) From f6cb04feff5e306d505a202e3466c14aa592ef8c Mon Sep 17 00:00:00 2001 From: Pulkit Maloo Date: Fri, 17 Aug 2018 15:00:35 -0400 Subject: [PATCH 18/29] PEP8 Issue: added whitespace after ',' --- pandas/tests/test_strings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 4649d5e70f955..000d3a4133330 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -549,7 +549,7 @@ def test_contains(self): assert res.loc[2] == "foo" # category - values = Series(["a","b","c","a", np.nan], dtype="category") + values = Series(["a", "b", "c", "a", np.nan], dtype="category") result = values.str.contains('a', na=True) expected = Series([True, False, False, True, True], dtype=np.object_) assert isinstance(result, Series) From 1f0256a879af67f3603fdc0b14dad14496c05bd4 Mon Sep 17 00:00:00 2001 From: Pulkit Maloo Date: Fri, 24 Aug 2018 14:32:40 -0400 Subject: [PATCH 19/29] na argument for wrap_results in match --- pandas/core/strings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 2c8a7f16230e1..70d85c4a893ee 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -2510,7 +2510,7 @@ def contains(self, pat, case=True, flags=0, na=np.nan, regex=True): @copy(str_match) def match(self, pat, case=True, flags=0, na=np.nan): result = str_match(self._parent, pat, case=case, flags=flags, na=na) - return self._wrap_result(result) + return self._wrap_result(result, na=na) @copy(str_replace) def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): From 7025f346acd1caef13901def568a57722cecd6ad Mon Sep 17 00:00:00 2001 From: Pulkit Maloo Date: Sat, 22 Sep 2018 15:59:41 -0400 Subject: [PATCH 20/29] Update travis-27-locale.yaml --- ci/travis-27-locale.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/travis-27-locale.yaml b/ci/travis-27-locale.yaml index aca65f27d4187..f08d69efaac6c 100644 --- a/ci/travis-27-locale.yaml +++ b/ci/travis-27-locale.yaml @@ -7,7 +7,7 @@ dependencies: - cython=0.28.2 - lxml - matplotlib=1.4.3 - - numpy=1.9.3 + - numpy=1.9.2 - openpyxl=2.4.0 - python-dateutil - python-blosc From 754244812eca20870992d5146a2314c807426cb2 Mon Sep 17 00:00:00 2001 From: Pulkit Maloo Date: Sat, 22 Sep 2018 16:37:37 -0400 Subject: [PATCH 21/29] added tests for na arg for categorical and objects --- pandas/tests/test_strings.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 000d3a4133330..b8d7a9d5c9eb3 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -548,12 +548,24 @@ def test_contains(self): res = values.str.contains('foo', na="foo") assert res.loc[2] == "foo" - # category + # na for category values = Series(["a", "b", "c", "a", np.nan], dtype="category") result = values.str.contains('a', na=True) expected = Series([True, False, False, True, True], dtype=np.object_) assert isinstance(result, Series) tm.assert_series_equal(result, expected) + result = values.str.contains('a', na=False) + expected = Series([True, False, False, True, False], dtype=np.object_) + tm.assert_series_equal(result, expected) + + # na for objects + values = Series(["a", "b", "c", "a", np.nan]) + result = values.str.contains('a', na=True) + expected = Series([True, False, False, True, True], dtype=np.object_) + tm.assert_series_equal(result, expected) + result = values.str.contains('a', na=False) + expected = Series([True, False, False, True, False], dtype=np.object_) + tm.assert_series_equal(result, expected) def test_startswith(self): values = Series(['om', NA, 'foo_nom', 'nom', 'bar_foo', NA, 'foo']) From f1b4274e4b68a29cc635202e4be665440098d90c Mon Sep 17 00:00:00 2001 From: Pulkit Maloo Date: Sun, 23 Sep 2018 07:20:57 -0400 Subject: [PATCH 22/29] updated _wrap_results with arg fill_value and removed na --- pandas/core/strings.py | 14 ++++---------- pandas/tests/test_strings.py | 15 ++++++--------- 2 files changed, 10 insertions(+), 19 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 70d85c4a893ee..c5a8020ac461c 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1862,7 +1862,7 @@ def __iter__(self): g = self.get(i) def _wrap_result(self, result, use_codes=True, - name=None, expand=None, na=np.nan): + name=None, expand=None, fill_value=np.nan): from pandas.core.index import Index, MultiIndex @@ -1872,13 +1872,7 @@ def _wrap_result(self, result, use_codes=True, # so make it possible to skip this step as the method already did this # before the transformation... if use_codes and self._is_categorical: - result = take_1d(result, self._orig.cat.codes) - missing = isna(result) - - if missing.any(): - result_type = np.result_type(result, na) - result = result.astype(result_type, copy=False) - result[isna(result)] = na + result = take_1d(result, self._orig.cat.codes, fill_value=fill_value) if not hasattr(result, 'ndim') or not hasattr(result, 'dtype'): return result @@ -2505,12 +2499,12 @@ def join(self, sep): def contains(self, pat, case=True, flags=0, na=np.nan, regex=True): result = str_contains(self._parent, pat, case=case, flags=flags, na=na, regex=regex) - return self._wrap_result(result, na=na) + return self._wrap_result(result, fill_value=na) @copy(str_match) def match(self, pat, case=True, flags=0, na=np.nan): result = str_match(self._parent, pat, case=case, flags=flags, na=na) - return self._wrap_result(result, na=na) + return self._wrap_result(result, fill_value=na) @copy(str_replace) def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index b8d7a9d5c9eb3..37d57a56510e1 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -543,27 +543,24 @@ def test_contains(self): assert result.dtype == np.bool_ tm.assert_numpy_array_equal(result, expected) - # na - values = Series(['om', 'foo', np.nan]) - res = values.str.contains('foo', na="foo") - assert res.loc[2] == "foo" - # na for category values = Series(["a", "b", "c", "a", np.nan], dtype="category") - result = values.str.contains('a', na=True) + result = values.str.contains('a', na=True).astype(object) expected = Series([True, False, False, True, True], dtype=np.object_) assert isinstance(result, Series) tm.assert_series_equal(result, expected) - result = values.str.contains('a', na=False) + + result = values.str.contains('a', na=False).astype(object) expected = Series([True, False, False, True, False], dtype=np.object_) tm.assert_series_equal(result, expected) # na for objects values = Series(["a", "b", "c", "a", np.nan]) - result = values.str.contains('a', na=True) + result = values.str.contains('a', na=True).astype(object) expected = Series([True, False, False, True, True], dtype=np.object_) tm.assert_series_equal(result, expected) - result = values.str.contains('a', na=False) + + result = values.str.contains('a', na=False).astype(object) expected = Series([True, False, False, True, False], dtype=np.object_) tm.assert_series_equal(result, expected) From 7a09c4422ab35a64dc3d222c826c9b8e74e4d73e Mon Sep 17 00:00:00 2001 From: Pulkit Maloo Date: Sun, 23 Sep 2018 21:01:05 -0400 Subject: [PATCH 23/29] Update travis-27-locale.yaml --- ci/travis-27-locale.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/travis-27-locale.yaml b/ci/travis-27-locale.yaml index f08d69efaac6c..aca65f27d4187 100644 --- a/ci/travis-27-locale.yaml +++ b/ci/travis-27-locale.yaml @@ -7,7 +7,7 @@ dependencies: - cython=0.28.2 - lxml - matplotlib=1.4.3 - - numpy=1.9.2 + - numpy=1.9.3 - openpyxl=2.4.0 - python-dateutil - python-blosc From 3408920da4f0db3de9ca025742a3d739dd61dd1f Mon Sep 17 00:00:00 2001 From: Pulkit Maloo Date: Sun, 23 Sep 2018 21:15:04 -0400 Subject: [PATCH 24/29] fixed line too long --- pandas/core/strings.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index c5a8020ac461c..cc92ada775d7b 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1872,7 +1872,8 @@ def _wrap_result(self, result, use_codes=True, # so make it possible to skip this step as the method already did this # before the transformation... if use_codes and self._is_categorical: - result = take_1d(result, self._orig.cat.codes, fill_value=fill_value) + result = take_1d(result, self._orig.cat.codes, + fill_value=fill_value) if not hasattr(result, 'ndim') or not hasattr(result, 'dtype'): return result From 3288d110096cc1af2b3e9bbcfa8864f9d1eede05 Mon Sep 17 00:00:00 2001 From: Pulkit Maloo Date: Tue, 25 Sep 2018 10:49:57 -0400 Subject: [PATCH 25/29] whatsnew note --- doc/source/whatsnew/v0.24.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index ed1bf0a4f8394..96b9a3395b3b8 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -704,7 +704,7 @@ Numeric Strings ^^^^^^^ -- +- Bug in :class:`StringMethods` where `str_contains()` was not filling missing values with given argument for na for a categorical Series (:issue:`22158`) - - From 6c87770e76a558855327f1401548ee7b6da7d161 Mon Sep 17 00:00:00 2001 From: Pulkit Maloo Date: Tue, 25 Sep 2018 13:22:33 -0400 Subject: [PATCH 26/29] Update v0.24.0.txt --- doc/source/whatsnew/v0.24.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 96b9a3395b3b8..733127e92093a 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -704,7 +704,7 @@ Numeric Strings ^^^^^^^ -- Bug in :class:`StringMethods` where `str_contains()` was not filling missing values with given argument for na for a categorical Series (:issue:`22158`) +- Bug in :class:`StringMethods` where :func:`Series.str.contains` was not filling missing values with given argument for na for a categorical ``Series`` (:issue:`22158`) - - From d242647b15c096478afd1b2c0ac2149be53205c0 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 18 Oct 2018 20:38:32 -0400 Subject: [PATCH 27/29] Update doc/source/whatsnew/v0.24.0.txt Co-Authored-By: pulkitmaloo --- doc/source/whatsnew/v0.24.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 733127e92093a..ad5aa0de9f7de 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -704,7 +704,7 @@ Numeric Strings ^^^^^^^ -- Bug in :class:`StringMethods` where :func:`Series.str.contains` was not filling missing values with given argument for na for a categorical ``Series`` (:issue:`22158`) +- Bug :func:`Series.str.contains` not respecting the ``na`` argument for a Categorica dtype ``Series`` (:issue:`22158`) - - From fd994313e18dabc77bfb74bbe8da1baa89c6e2e1 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 18 Nov 2018 17:11:40 -0500 Subject: [PATCH 28/29] whatsnew --- doc/source/whatsnew/v0.24.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index d6f9bb66e1e28..89404fc4305f8 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1272,7 +1272,7 @@ Strings - Bug in :meth:`Index.str.partition` was not nan-safe (:issue:`23558`). - Bug in :meth:`Index.str.split` was not nan-safe (:issue:`23677`). -- +- Bug :func:`Series.str.contains` not respecting the ``na`` argument for a ``Categorical`` dtype ``Series`` (:issue:`22158`) Interval ^^^^^^^^ From 44b36a4eb1326676331f037df698fcdccdeb94ec Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 18 Nov 2018 17:17:41 -0500 Subject: [PATCH 29/29] cleanup --- pandas/tests/test_strings.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 053f590eecb47..2ff63b67d1202 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -512,25 +512,27 @@ def test_contains(self): assert result.dtype == np.bool_ tm.assert_numpy_array_equal(result, expected) + def test_contains_for_object_category(self): + # gh 22158 + # na for category values = Series(["a", "b", "c", "a", np.nan], dtype="category") - result = values.str.contains('a', na=True).astype(object) - expected = Series([True, False, False, True, True], dtype=np.object_) - assert isinstance(result, Series) + result = values.str.contains('a', na=True) + expected = Series([True, False, False, True, True]) tm.assert_series_equal(result, expected) - result = values.str.contains('a', na=False).astype(object) - expected = Series([True, False, False, True, False], dtype=np.object_) + result = values.str.contains('a', na=False) + expected = Series([True, False, False, True, False]) tm.assert_series_equal(result, expected) # na for objects values = Series(["a", "b", "c", "a", np.nan]) - result = values.str.contains('a', na=True).astype(object) - expected = Series([True, False, False, True, True], dtype=np.object_) + result = values.str.contains('a', na=True) + expected = Series([True, False, False, True, True]) tm.assert_series_equal(result, expected) - result = values.str.contains('a', na=False).astype(object) - expected = Series([True, False, False, True, False], dtype=np.object_) + result = values.str.contains('a', na=False) + expected = Series([True, False, False, True, False]) tm.assert_series_equal(result, expected) def test_startswith(self): @@ -2891,7 +2893,7 @@ def test_get_complex_nested(self, to_type): expected = Series([np.nan]) tm.assert_series_equal(result, expected) - def test_more_contains(self): + def test_contains_moar(self): # PR #1179 s = Series(['A', 'B', 'C', 'Aaba', 'Baca', '', NA, 'CABA', 'dog', 'cat']) @@ -2941,7 +2943,7 @@ def test_contains_nan(self): expected = Series([np.nan, np.nan, np.nan], dtype=np.object_) assert_series_equal(result, expected) - def test_more_replace(self): + def test_replace_moar(self): # PR #1179 s = Series(['A', 'B', 'C', 'Aaba', 'Baca', '', NA, 'CABA', 'dog', 'cat'])