From 10552b5a17f2c0cc9458f4f099e0a8d6edb2b6cf Mon Sep 17 00:00:00 2001 From: Markus Meier Date: Sat, 10 Nov 2018 22:03:04 +0100 Subject: [PATCH 1/8] BUG: Index.str.partition not nan-safe (#23558) --- pandas/_libs/lib.pyx | 4 ++-- pandas/tests/test_strings.py | 18 ++++++++++-------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index a9e0fcbc4a826..78c0ca99f25ea 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2274,7 +2274,7 @@ def to_object_array_tuples(rows: list): k = 0 for i in range(n): - tmp = len(rows[i]) + tmp = 1 if is_float(rows[i]) else len(rows[i]) if tmp > k: k = tmp @@ -2288,7 +2288,7 @@ def to_object_array_tuples(rows: list): except Exception: # upcast any subclasses to tuple for i in range(n): - row = tuple(rows[i]) + row = tuple(rows[i]) if not is_float(rows[i]) else (rows[i],) for j in range(len(row)): result[i, j] = row[j] diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index f0873eb7683e9..36d1069515f1e 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -2493,7 +2493,7 @@ def test_partition_series(self): ('f g', ' ', 'h')]) tm.assert_series_equal(result, exp) - # Not splited + # Not split values = Series(['abc', 'cde', NA, 'fgh']) result = values.str.partition('_', expand=False) exp = Series([('abc', '', ''), ('cde', '', ''), NA, ('fgh', '', '')]) @@ -2524,28 +2524,30 @@ def test_partition_series(self): assert result == [v.rpartition('_') for v in values] def test_partition_index(self): - values = Index(['a_b_c', 'c_d_e', 'f_g_h']) + values = Index(['a_b_c', 'c_d_e', 'f_g_h', np.nan]) result = values.str.partition('_', expand=False) - exp = Index(np.array([('a', '_', 'b_c'), ('c', '_', 'd_e'), ('f', '_', - 'g_h')])) + exp = Index(np.array([('a', '_', 'b_c'), ('c', '_', 'd_e'), + ('f', '_', 'g_h'), np.nan])) tm.assert_index_equal(result, exp) assert result.nlevels == 1 result = values.str.rpartition('_', expand=False) - exp = Index(np.array([('a_b', '_', 'c'), ('c_d', '_', 'e'), ( - 'f_g', '_', 'h')])) + exp = Index(np.array([('a_b', '_', 'c'), ('c_d', '_', 'e'), + ('f_g', '_', 'h'), np.nan])) tm.assert_index_equal(result, exp) assert result.nlevels == 1 result = values.str.partition('_') - exp = Index([('a', '_', 'b_c'), ('c', '_', 'd_e'), ('f', '_', 'g_h')]) + exp = Index([('a', '_', 'b_c'), ('c', '_', 'd_e'), + ('f', '_', 'g_h'), (np.nan, np.nan, np.nan)]) tm.assert_index_equal(result, exp) assert isinstance(result, MultiIndex) assert result.nlevels == 3 result = values.str.rpartition('_') - exp = Index([('a_b', '_', 'c'), ('c_d', '_', 'e'), ('f_g', '_', 'h')]) + exp = Index([('a_b', '_', 'c'), ('c_d', '_', 'e'), + ('f_g', '_', 'h'), (np.nan, np.nan, np.nan)]) tm.assert_index_equal(result, exp) assert isinstance(result, MultiIndex) assert result.nlevels == 3 From a30dc25ca9229c48adeced2ca2eaefeb7d3b1b05 Mon Sep 17 00:00:00 2001 From: Markus Meier Date: Sun, 11 Nov 2018 03:05:30 +0100 Subject: [PATCH 2/8] code review: check directly if value is nan --- pandas/_libs/lib.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 78c0ca99f25ea..4f2166699e393 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2274,7 +2274,7 @@ def to_object_array_tuples(rows: list): k = 0 for i in range(n): - tmp = 1 if is_float(rows[i]) else len(rows[i]) + tmp = 1 if util.is_nan(rows[i]) else len(rows[i]) if tmp > k: k = tmp @@ -2288,7 +2288,7 @@ def to_object_array_tuples(rows: list): except Exception: # upcast any subclasses to tuple for i in range(n): - row = tuple(rows[i]) if not is_float(rows[i]) else (rows[i],) + row = (rows[i],) if util.is_nan(rows[i]) else tuple(rows[i]) for j in range(len(row)): result[i, j] = row[j] From 213e7b6b434c4c508df2e63c3db3346fb7abdb03 Mon Sep 17 00:00:00 2001 From: Markus Meier Date: Sun, 11 Nov 2018 23:17:16 +0100 Subject: [PATCH 3/8] Code review: added whatsnew entry --- doc/source/whatsnew/v0.24.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 5fefb9e3e405c..48a4e13081539 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -1192,7 +1192,7 @@ Numeric Strings ^^^^^^^ -- +- BUG Index.str.partition not nan-safe (:issue:`23558`) - - From f13553f6d4b064bca6c3367edbd961dbb3addd93 Mon Sep 17 00:00:00 2001 From: Markus Meier Date: Mon, 12 Nov 2018 00:41:10 +0100 Subject: [PATCH 4/8] Code review: included suggestes improvements --- doc/source/whatsnew/v0.24.0.txt | 2 +- pandas/_libs/lib.pyx | 4 +- pandas/tests/test_strings.py | 66 +++++++++++++++++---------------- 3 files changed, 38 insertions(+), 34 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 48a4e13081539..bde315adb2006 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -1192,7 +1192,7 @@ Numeric Strings ^^^^^^^ -- BUG Index.str.partition not nan-safe (:issue:`23558`) +- BUG in :func:`Index.str.partition` is not nan-safe (:issue:`23558`) - - diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 4f2166699e393..c87d1f7844f5d 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2274,7 +2274,7 @@ def to_object_array_tuples(rows: list): k = 0 for i in range(n): - tmp = 1 if util.is_nan(rows[i]) else len(rows[i]) + tmp = 1 if checknull(rows[i]) else len(rows[i]) if tmp > k: k = tmp @@ -2288,7 +2288,7 @@ def to_object_array_tuples(rows: list): except Exception: # upcast any subclasses to tuple for i in range(n): - row = (rows[i],) if util.is_nan(rows[i]) else tuple(rows[i]) + row = (rows[i],) if checknull(rows[i]) else tuple(rows[i]) for j in range(len(row)): result[i, j] = row[j] diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 36d1069515f1e..6e32000434c2e 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -2457,50 +2457,52 @@ def test_split_with_name(self): tm.assert_index_equal(res, exp) def test_partition_series(self): - values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h']) + values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h', None]) result = values.str.partition('_', expand=False) exp = Series([('a', '_', 'b_c'), ('c', '_', 'd_e'), NA, - ('f', '_', 'g_h')]) + ('f', '_', 'g_h'), None]) tm.assert_series_equal(result, exp) result = values.str.rpartition('_', expand=False) exp = Series([('a_b', '_', 'c'), ('c_d', '_', 'e'), NA, - ('f_g', '_', 'h')]) + ('f_g', '_', 'h'), None]) tm.assert_series_equal(result, exp) # more than one char - values = Series(['a__b__c', 'c__d__e', NA, 'f__g__h']) + values = Series(['a__b__c', 'c__d__e', NA, 'f__g__h', None]) result = values.str.partition('__', expand=False) exp = Series([('a', '__', 'b__c'), ('c', '__', 'd__e'), NA, - ('f', '__', 'g__h')]) + ('f', '__', 'g__h'), None]) tm.assert_series_equal(result, exp) result = values.str.rpartition('__', expand=False) exp = Series([('a__b', '__', 'c'), ('c__d', '__', 'e'), NA, - ('f__g', '__', 'h')]) + ('f__g', '__', 'h'), None]) tm.assert_series_equal(result, exp) # None - values = Series(['a b c', 'c d e', NA, 'f g h']) + values = Series(['a b c', 'c d e', NA, 'f g h', None]) result = values.str.partition(expand=False) exp = Series([('a', ' ', 'b c'), ('c', ' ', 'd e'), NA, - ('f', ' ', 'g h')]) + ('f', ' ', 'g h'), None]) tm.assert_series_equal(result, exp) result = values.str.rpartition(expand=False) exp = Series([('a b', ' ', 'c'), ('c d', ' ', 'e'), NA, - ('f g', ' ', 'h')]) + ('f g', ' ', 'h'), None]) tm.assert_series_equal(result, exp) # Not split - values = Series(['abc', 'cde', NA, 'fgh']) + values = Series(['abc', 'cde', NA, 'fgh', None]) result = values.str.partition('_', expand=False) - exp = Series([('abc', '', ''), ('cde', '', ''), NA, ('fgh', '', '')]) + exp = Series([('abc', '', ''), ('cde', '', ''), NA, + ('fgh', '', ''), None]) tm.assert_series_equal(result, exp) result = values.str.rpartition('_', expand=False) - exp = Series([('', '', 'abc'), ('', '', 'cde'), NA, ('', '', 'fgh')]) + exp = Series([('', '', 'abc'), ('', '', 'cde'), NA, + ('', '', 'fgh'), None]) tm.assert_series_equal(result, exp) # unicode @@ -2524,59 +2526,61 @@ def test_partition_series(self): assert result == [v.rpartition('_') for v in values] def test_partition_index(self): - values = Index(['a_b_c', 'c_d_e', 'f_g_h', np.nan]) + values = Index(['a_b_c', 'c_d_e', 'f_g_h', np.nan, None]) result = values.str.partition('_', expand=False) exp = Index(np.array([('a', '_', 'b_c'), ('c', '_', 'd_e'), - ('f', '_', 'g_h'), np.nan])) + ('f', '_', 'g_h'), np.nan, None])) tm.assert_index_equal(result, exp) assert result.nlevels == 1 result = values.str.rpartition('_', expand=False) exp = Index(np.array([('a_b', '_', 'c'), ('c_d', '_', 'e'), - ('f_g', '_', 'h'), np.nan])) + ('f_g', '_', 'h'), np.nan, None])) tm.assert_index_equal(result, exp) assert result.nlevels == 1 result = values.str.partition('_') exp = Index([('a', '_', 'b_c'), ('c', '_', 'd_e'), - ('f', '_', 'g_h'), (np.nan, np.nan, np.nan)]) + ('f', '_', 'g_h'), (np.nan, np.nan, np.nan), + (None, None, None)]) tm.assert_index_equal(result, exp) assert isinstance(result, MultiIndex) assert result.nlevels == 3 result = values.str.rpartition('_') exp = Index([('a_b', '_', 'c'), ('c_d', '_', 'e'), - ('f_g', '_', 'h'), (np.nan, np.nan, np.nan)]) + ('f_g', '_', 'h'), (np.nan, np.nan, np.nan), + (None, None, None)]) tm.assert_index_equal(result, exp) assert isinstance(result, MultiIndex) assert result.nlevels == 3 def test_partition_to_dataframe(self): - values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h']) + values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h', None]) result = values.str.partition('_') - exp = DataFrame({0: ['a', 'c', np.nan, 'f'], - 1: ['_', '_', np.nan, '_'], - 2: ['b_c', 'd_e', np.nan, 'g_h']}) + exp = DataFrame({0: ['a', 'c', np.nan, 'f', None], + 1: ['_', '_', np.nan, '_', None], + 2: ['b_c', 'd_e', np.nan, 'g_h', None]}) tm.assert_frame_equal(result, exp) result = values.str.rpartition('_') - exp = DataFrame({0: ['a_b', 'c_d', np.nan, 'f_g'], - 1: ['_', '_', np.nan, '_'], - 2: ['c', 'e', np.nan, 'h']}) + exp = DataFrame({0: ['a_b', 'c_d', np.nan, 'f_g', None], + 1: ['_', '_', np.nan, '_', None], + 2: ['c', 'e', np.nan, 'h', None]}) tm.assert_frame_equal(result, exp) - values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h']) + values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h', None]) result = values.str.partition('_', expand=True) - exp = DataFrame({0: ['a', 'c', np.nan, 'f'], - 1: ['_', '_', np.nan, '_'], - 2: ['b_c', 'd_e', np.nan, 'g_h']}) + exp = DataFrame({0: ['a', 'c', np.nan, 'f', None], + 1: ['_', '_', np.nan, '_', None], + 2: ['b_c', 'd_e', np.nan, 'g_h', None]}) tm.assert_frame_equal(result, exp) result = values.str.rpartition('_', expand=True) - exp = DataFrame({0: ['a_b', 'c_d', np.nan, 'f_g'], - 1: ['_', '_', np.nan, '_'], - 2: ['c', 'e', np.nan, 'h']}) + exp = DataFrame({0: ['a_b', 'c_d', np.nan, 'f_g', None], + 1: ['_', '_', np.nan, '_', None], + 2: ['c', 'e', np.nan, 'h', None]}) tm.assert_frame_equal(result, exp) def test_partition_with_name(self): From 46b7e7c97dbb323e98e93e0faf0a598cf40d80a6 Mon Sep 17 00:00:00 2001 From: Markus Meier Date: Fri, 16 Nov 2018 00:22:58 +0100 Subject: [PATCH 5/8] TST: tests for bug Index.str.split(expand=True) not nan-safe (#23677) --- doc/source/whatsnew/v0.24.0.rst | 2 +- pandas/tests/test_strings.py | 29 +++++++++++++++++++---------- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 77ed9c7bd1060..df6ed7daf891f 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1261,7 +1261,7 @@ Numeric Strings ^^^^^^^ -- BUG in :func:`Index.str.partition` is not nan-safe (:issue:`23558`) +- BUG in :meth:`Index.str.partition` was not nan-safe (:issue:`23558`). - - diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 1095384b08e0a..b0986b8bd6ade 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -2330,24 +2330,33 @@ def test_split_to_dataframe(self): s.str.split('_', expand="not_a_boolean") def test_split_to_multiindex_expand(self): - idx = Index(['nosplit', 'alsonosplit']) + idx = Index(['nosplit', 'alsonosplit', np.nan]) result = idx.str.split('_', expand=True) exp = idx tm.assert_index_equal(result, exp) assert result.nlevels == 1 - idx = Index(['some_equal_splits', 'with_no_nans']) + idx = Index(['some_equal_splits', 'with_no_nans', np.nan, None]) result = idx.str.split('_', expand=True) - exp = MultiIndex.from_tuples([('some', 'equal', 'splits'), ( - 'with', 'no', 'nans')]) + exp = MultiIndex.from_tuples([('some', 'equal', 'splits'), + ('with', 'no', 'nans'), + [np.nan, np.nan, np.nan], + [None, None, None]]) tm.assert_index_equal(result, exp) assert result.nlevels == 3 - idx = Index(['some_unequal_splits', 'one_of_these_things_is_not']) + idx = Index(['some_unequal_splits', + 'one_of_these_things_is_not', + np.nan, None]) result = idx.str.split('_', expand=True) - exp = MultiIndex.from_tuples([('some', 'unequal', 'splits', NA, NA, NA - ), ('one', 'of', 'these', 'things', - 'is', 'not')]) + exp = MultiIndex.from_tuples([('some', 'unequal', 'splits', + NA, NA, NA), + ('one', 'of', 'these', + 'things', 'is', 'not'), + (np.nan, np.nan, np.nan, + np.nan, np.nan, np.nan), + (None, None, None, + None, None, None)]) tm.assert_index_equal(result, exp) assert result.nlevels == 6 @@ -2480,12 +2489,12 @@ def test_partition_series(self): # Not split values = Series(['abc', 'cde', NA, 'fgh', None]) result = values.str.partition('_', expand=False) - exp = Series([('abc', '', ''), ('cde', '', ''), NA, + exp = Series([('abc', '', ''), ('cde', '', ''), NA, ('fgh', '', ''), None]) tm.assert_series_equal(result, exp) result = values.str.rpartition('_', expand=False) - exp = Series([('', '', 'abc'), ('', '', 'cde'), NA, + exp = Series([('', '', 'abc'), ('', '', 'cde'), NA, ('', '', 'fgh'), None]) tm.assert_series_equal(result, exp) From 25d5743ba64e8e525a156a1585497be616dd2928 Mon Sep 17 00:00:00 2001 From: Markus Meier Date: Fri, 16 Nov 2018 14:52:00 +0100 Subject: [PATCH 6/8] DOC: Appropriate formatting of whatsnew entry --- doc/source/whatsnew/v0.24.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index df6ed7daf891f..b39be2d0cb868 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1261,7 +1261,7 @@ Numeric Strings ^^^^^^^ -- BUG in :meth:`Index.str.partition` was not nan-safe (:issue:`23558`). +- Bug in :meth:`Index.str.partition` was not nan-safe (:issue:`23558`). - - From bb94c383b97a5bed2aecaebd55b862120e28545b Mon Sep 17 00:00:00 2001 From: Markus Meier Date: Fri, 16 Nov 2018 22:44:01 +0100 Subject: [PATCH 7/8] DOC Added resolved issue annotation to tests DOC Added whatsnew message for resolved issue #23677 --- doc/source/whatsnew/v0.24.0.rst | 2 +- pandas/tests/test_strings.py | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index b39be2d0cb868..37d644e196391 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1262,7 +1262,7 @@ Strings ^^^^^^^ - Bug in :meth:`Index.str.partition` was not nan-safe (:issue:`23558`). -- +- Bug in :meth:`Index.str.split` was not nan-safe (:issue:`23677`). - Interval diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index b0986b8bd6ade..ceb78c65526fc 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -2330,6 +2330,8 @@ def test_split_to_dataframe(self): s.str.split('_', expand="not_a_boolean") def test_split_to_multiindex_expand(self): + # https://github.com/pandas-dev/pandas/issues/23677 + idx = Index(['nosplit', 'alsonosplit', np.nan]) result = idx.str.split('_', expand=True) exp = idx @@ -2450,6 +2452,8 @@ def test_split_with_name(self): tm.assert_index_equal(res, exp) def test_partition_series(self): + # https://github.com/pandas-dev/pandas/issues/23558 + values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h', None]) result = values.str.partition('_', expand=False) @@ -2519,6 +2523,8 @@ def test_partition_series(self): assert result == [v.rpartition('_') for v in values] def test_partition_index(self): + # https://github.com/pandas-dev/pandas/issues/23558 + values = Index(['a_b_c', 'c_d_e', 'f_g_h', np.nan, None]) result = values.str.partition('_', expand=False) @@ -2550,6 +2556,8 @@ def test_partition_index(self): assert result.nlevels == 3 def test_partition_to_dataframe(self): + # https://github.com/pandas-dev/pandas/issues/23558 + values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h', None]) result = values.str.partition('_') exp = DataFrame({0: ['a', 'c', np.nan, 'f', None], From 25e3cc46537a5f3f9ef6fc265d0847c9cf422177 Mon Sep 17 00:00:00 2001 From: Markus Meier Date: Sun, 18 Nov 2018 02:04:54 +0100 Subject: [PATCH 8/8] flake8 fix - blank line --- pandas/tests/test_strings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index ceb78c65526fc..42f0cebea83a0 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -2453,7 +2453,7 @@ def test_split_with_name(self): def test_partition_series(self): # https://github.com/pandas-dev/pandas/issues/23558 - + values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h', None]) result = values.str.partition('_', expand=False)