Skip to content

BUG: Index.str.partition not nan-safe (#23558) #23618

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Nov 18, 2018
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.24.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1261,7 +1261,7 @@ Numeric
Strings
^^^^^^^

-
- Bug in :meth:`Index.str.partition` where the method was not nan-safe (:issue:`23558`).
-
-

Expand Down
4 changes: 2 additions & 2 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2273,7 +2273,7 @@ def to_object_array_tuples(rows: list):

k = 0
for i in range(n):
tmp = len(rows[i])
tmp = 1 if checknull(rows[i]) else len(rows[i])
if tmp > k:
k = tmp

Expand All @@ -2287,7 +2287,7 @@ def to_object_array_tuples(rows: list):
except Exception:
# upcast any subclasses to tuple
for i in range(n):
row = tuple(rows[i])
row = (rows[i],) if checknull(rows[i]) else tuple(rows[i])
for j in range(len(row)):
result[i, j] = row[j]

Expand Down
99 changes: 57 additions & 42 deletions pandas/tests/test_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -2330,24 +2330,33 @@ def test_split_to_dataframe(self):
s.str.split('_', expand="not_a_boolean")

def test_split_to_multiindex_expand(self):
idx = Index(['nosplit', 'alsonosplit'])
idx = Index(['nosplit', 'alsonosplit', np.nan])
result = idx.str.split('_', expand=True)
exp = idx
tm.assert_index_equal(result, exp)
assert result.nlevels == 1

idx = Index(['some_equal_splits', 'with_no_nans'])
idx = Index(['some_equal_splits', 'with_no_nans', np.nan, None])
result = idx.str.split('_', expand=True)
exp = MultiIndex.from_tuples([('some', 'equal', 'splits'), (
'with', 'no', 'nans')])
exp = MultiIndex.from_tuples([('some', 'equal', 'splits'),
('with', 'no', 'nans'),
[np.nan, np.nan, np.nan],
[None, None, None]])
tm.assert_index_equal(result, exp)
assert result.nlevels == 3

idx = Index(['some_unequal_splits', 'one_of_these_things_is_not'])
idx = Index(['some_unequal_splits',
'one_of_these_things_is_not',
np.nan, None])
result = idx.str.split('_', expand=True)
exp = MultiIndex.from_tuples([('some', 'unequal', 'splits', NA, NA, NA
), ('one', 'of', 'these', 'things',
'is', 'not')])
exp = MultiIndex.from_tuples([('some', 'unequal', 'splits',
NA, NA, NA),
('one', 'of', 'these',
'things', 'is', 'not'),
(np.nan, np.nan, np.nan,
np.nan, np.nan, np.nan),
(None, None, None,
None, None, None)])
tm.assert_index_equal(result, exp)
assert result.nlevels == 6

Expand Down Expand Up @@ -2441,50 +2450,52 @@ def test_split_with_name(self):
tm.assert_index_equal(res, exp)

def test_partition_series(self):
values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])
values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h', None])

result = values.str.partition('_', expand=False)
exp = Series([('a', '_', 'b_c'), ('c', '_', 'd_e'), NA,
('f', '_', 'g_h')])
('f', '_', 'g_h'), None])
tm.assert_series_equal(result, exp)

result = values.str.rpartition('_', expand=False)
exp = Series([('a_b', '_', 'c'), ('c_d', '_', 'e'), NA,
('f_g', '_', 'h')])
('f_g', '_', 'h'), None])
tm.assert_series_equal(result, exp)

# more than one char
values = Series(['a__b__c', 'c__d__e', NA, 'f__g__h'])
values = Series(['a__b__c', 'c__d__e', NA, 'f__g__h', None])
result = values.str.partition('__', expand=False)
exp = Series([('a', '__', 'b__c'), ('c', '__', 'd__e'), NA,
('f', '__', 'g__h')])
('f', '__', 'g__h'), None])
tm.assert_series_equal(result, exp)

result = values.str.rpartition('__', expand=False)
exp = Series([('a__b', '__', 'c'), ('c__d', '__', 'e'), NA,
('f__g', '__', 'h')])
('f__g', '__', 'h'), None])
tm.assert_series_equal(result, exp)

# None
values = Series(['a b c', 'c d e', NA, 'f g h'])
values = Series(['a b c', 'c d e', NA, 'f g h', None])
result = values.str.partition(expand=False)
exp = Series([('a', ' ', 'b c'), ('c', ' ', 'd e'), NA,
('f', ' ', 'g h')])
('f', ' ', 'g h'), None])
tm.assert_series_equal(result, exp)

result = values.str.rpartition(expand=False)
exp = Series([('a b', ' ', 'c'), ('c d', ' ', 'e'), NA,
('f g', ' ', 'h')])
('f g', ' ', 'h'), None])
tm.assert_series_equal(result, exp)

# Not splited
values = Series(['abc', 'cde', NA, 'fgh'])
# Not split
values = Series(['abc', 'cde', NA, 'fgh', None])
result = values.str.partition('_', expand=False)
exp = Series([('abc', '', ''), ('cde', '', ''), NA, ('fgh', '', '')])
exp = Series([('abc', '', ''), ('cde', '', ''), NA,
('fgh', '', ''), None])
tm.assert_series_equal(result, exp)

result = values.str.rpartition('_', expand=False)
exp = Series([('', '', 'abc'), ('', '', 'cde'), NA, ('', '', 'fgh')])
exp = Series([('', '', 'abc'), ('', '', 'cde'), NA,
('', '', 'fgh'), None])
tm.assert_series_equal(result, exp)

# unicode
Expand All @@ -2508,57 +2519,61 @@ def test_partition_series(self):
assert result == [v.rpartition('_') for v in values]

def test_partition_index(self):
values = Index(['a_b_c', 'c_d_e', 'f_g_h'])
values = Index(['a_b_c', 'c_d_e', 'f_g_h', np.nan, None])

result = values.str.partition('_', expand=False)
exp = Index(np.array([('a', '_', 'b_c'), ('c', '_', 'd_e'), ('f', '_',
'g_h')]))
exp = Index(np.array([('a', '_', 'b_c'), ('c', '_', 'd_e'),
('f', '_', 'g_h'), np.nan, None]))
tm.assert_index_equal(result, exp)
assert result.nlevels == 1

result = values.str.rpartition('_', expand=False)
exp = Index(np.array([('a_b', '_', 'c'), ('c_d', '_', 'e'), (
'f_g', '_', 'h')]))
exp = Index(np.array([('a_b', '_', 'c'), ('c_d', '_', 'e'),
('f_g', '_', 'h'), np.nan, None]))
tm.assert_index_equal(result, exp)
assert result.nlevels == 1

result = values.str.partition('_')
exp = Index([('a', '_', 'b_c'), ('c', '_', 'd_e'), ('f', '_', 'g_h')])
exp = Index([('a', '_', 'b_c'), ('c', '_', 'd_e'),
('f', '_', 'g_h'), (np.nan, np.nan, np.nan),
(None, None, None)])
tm.assert_index_equal(result, exp)
assert isinstance(result, MultiIndex)
assert result.nlevels == 3

result = values.str.rpartition('_')
exp = Index([('a_b', '_', 'c'), ('c_d', '_', 'e'), ('f_g', '_', 'h')])
exp = Index([('a_b', '_', 'c'), ('c_d', '_', 'e'),
('f_g', '_', 'h'), (np.nan, np.nan, np.nan),
(None, None, None)])
tm.assert_index_equal(result, exp)
assert isinstance(result, MultiIndex)
assert result.nlevels == 3

def test_partition_to_dataframe(self):
values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])
values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h', None])
result = values.str.partition('_')
exp = DataFrame({0: ['a', 'c', np.nan, 'f'],
1: ['_', '_', np.nan, '_'],
2: ['b_c', 'd_e', np.nan, 'g_h']})
exp = DataFrame({0: ['a', 'c', np.nan, 'f', None],
1: ['_', '_', np.nan, '_', None],
2: ['b_c', 'd_e', np.nan, 'g_h', None]})
tm.assert_frame_equal(result, exp)

result = values.str.rpartition('_')
exp = DataFrame({0: ['a_b', 'c_d', np.nan, 'f_g'],
1: ['_', '_', np.nan, '_'],
2: ['c', 'e', np.nan, 'h']})
exp = DataFrame({0: ['a_b', 'c_d', np.nan, 'f_g', None],
1: ['_', '_', np.nan, '_', None],
2: ['c', 'e', np.nan, 'h', None]})
tm.assert_frame_equal(result, exp)

values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])
values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h', None])
result = values.str.partition('_', expand=True)
exp = DataFrame({0: ['a', 'c', np.nan, 'f'],
1: ['_', '_', np.nan, '_'],
2: ['b_c', 'd_e', np.nan, 'g_h']})
exp = DataFrame({0: ['a', 'c', np.nan, 'f', None],
1: ['_', '_', np.nan, '_', None],
2: ['b_c', 'd_e', np.nan, 'g_h', None]})
tm.assert_frame_equal(result, exp)

result = values.str.rpartition('_', expand=True)
exp = DataFrame({0: ['a_b', 'c_d', np.nan, 'f_g'],
1: ['_', '_', np.nan, '_'],
2: ['c', 'e', np.nan, 'h']})
exp = DataFrame({0: ['a_b', 'c_d', np.nan, 'f_g', None],
1: ['_', '_', np.nan, '_', None],
2: ['c', 'e', np.nan, 'h', None]})
tm.assert_frame_equal(result, exp)

def test_partition_with_name(self):
Expand Down