Skip to content

Commit 01d85cb

Browse files
committed
ENH: improve extract and get_dummies methods for Index.str (fix for #9980)
1 parent 76571d0 commit 01d85cb

File tree

2 files changed

+52
-21
lines changed

2 files changed

+52
-21
lines changed

pandas/core/strings.py

+17-3
Original file line number | Diff line number | Diff line change
@@ -435,6 +435,7 @@ def str_extract(arr, pat, flags=0):
435435
"""
436436
from pandas.core.series import Series
437437
from pandas.core.frame import DataFrame
438+
from pandas.core.index import Index
438439

439440
regex = re.compile(pat, flags=flags)
440441
# just to be safe, check this
@@ -449,11 +450,19 @@ def f(x):
449450
return [np.nan if item is None else item for item in m.groups()]
450451
else:
451452
return empty_row
453+
452454
if regex.groups == 1:
453-
result = Series([f(val)[0] for val in arr],
454-
name=_get_single_group_name(regex),
455-
index=arr.index, dtype=object)
455+
if isinstance(arr, Index):
456+
result = Index([f(val)[0] for val in arr],
457+
name=_get_single_group_name(regex),
458+
dtype=object)
459+
else:
460+
result = Series([f(val)[0] for val in arr],
461+
name=_get_single_group_name(regex),
462+
index=arr.index, dtype=object)
456463
else:
464+
if isinstance(arr, Index):
465+
raise ValueError("only one regex group is supported with Index")
457466
names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
458467
columns = [names.get(1 + i, i) for i in range(regex.groups)]
459468
if arr.empty:
@@ -488,6 +497,11 @@ def str_get_dummies(arr, sep='|'):
488497
489498
"""
490499
from pandas.core.frame import DataFrame
500+
from pandas.core.index import Index
501+
502+
# GH9980, Index.str does not support get_dummies() as it returns a frame
503+
if isinstance(arr, Index):
504+
raise TypeError("get_dummies is not supported for string methods on Index")
491505

492506
# TODO remove this hack?
493507
arr = arr.fillna('')

pandas/tests/test_strings.py

+35-18
Original file line number | Diff line number | Diff line change
@@ -516,7 +516,6 @@ def test_match(self):
516516

517517
def test_extract(self):
518518
# Contains tests like those in test_match and some others.
519-
520519
values = Series(['fooBAD__barBAD', NA, 'foo'])
521520
er = [NA, NA] # empty row
522521

@@ -540,15 +539,30 @@ def test_extract(self):
540539
exp = DataFrame([[u('BAD__'), u('BAD')], er, er])
541540
tm.assert_frame_equal(result, exp)
542541

543-
# no groups
544-
s = Series(['A1', 'B2', 'C3'])
545-
f = lambda: s.str.extract('[ABC][123]')
546-
self.assertRaises(ValueError, f)
547-
548-
# only non-capturing groups
549-
f = lambda: s.str.extract('(?:[AB]).*')
550-
self.assertRaises(ValueError, f)
542+
# Index only works with one regex group since
543+
# multi-group would expand to a frame
544+
idx = Index(['A1', 'A2', 'A3', 'A4', 'B5'])
545+
with tm.assertRaisesRegexp(ValueError, "supported"):
546+
idx.str.extract('([AB])([123])')
547+
548+
# these should work for both Series and Index
549+
for klass in [Series, Index]:
550+
# no groups
551+
s_or_idx = klass(['A1', 'B2', 'C3'])
552+
f = lambda: s_or_idx.str.extract('[ABC][123]')
553+
self.assertRaises(ValueError, f)
554+
555+
# only non-capturing groups
556+
f = lambda: s_or_idx.str.extract('(?:[AB]).*')
557+
self.assertRaises(ValueError, f)
558+
559+
# single group renames series/index properly
560+
s_or_idx = klass(['A1', 'A2'])
561+
result = s_or_idx.str.extract(r'(?P<uno>A)\d')
562+
tm.assert_equal(result.name, 'uno')
563+
tm.assert_array_equal(result, klass(['A', 'A']))
551564

565+
s = Series(['A1', 'B2', 'C3'])
552566
# one group, no matches
553567
result = s.str.extract('(_)')
554568
exp = Series([NA, NA, NA], dtype=object)
@@ -569,14 +583,16 @@ def test_extract(self):
569583
exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]])
570584
tm.assert_frame_equal(result, exp)
571585

572-
# named group/groups
573-
result = s.str.extract('(?P<letter>[AB])(?P<number>[123])')
574-
exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]], columns=['letter', 'number'])
575-
tm.assert_frame_equal(result, exp)
586+
# one named group
576587
result = s.str.extract('(?P<letter>[AB])')
577588
exp = Series(['A', 'B', NA], name='letter')
578589
tm.assert_series_equal(result, exp)
579590

591+
# two named groups
592+
result = s.str.extract('(?P<letter>[AB])(?P<number>[123])')
593+
exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]], columns=['letter', 'number'])
594+
tm.assert_frame_equal(result, exp)
595+
580596
# mix named and unnamed groups
581597
result = s.str.extract('([AB])(?P<number>[123])')
582598
exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]], columns=[0, 'number'])
@@ -602,11 +618,6 @@ def test_extract(self):
602618
exp = DataFrame([['A', '1'], ['B', '2'], ['C', NA]], columns=['letter', 'number'])
603619
tm.assert_frame_equal(result, exp)
604620

605-
# single group renames series properly
606-
s = Series(['A1', 'A2'])
607-
result = s.str.extract(r'(?P<uno>A)\d')
608-
tm.assert_equal(result.name, 'uno')
609-
610621
# GH6348
611622
# not passing index to the extractor
612623
def check_index(index):
@@ -752,6 +763,12 @@ def test_get_dummies(self):
752763
columns=list('7ab'))
753764
tm.assert_frame_equal(result, expected)
754765

766+
# GH9980
767+
# Index.str does not support get_dummies() as it returns a frame
768+
with tm.assertRaisesRegexp(TypeError, "not supported"):
769+
idx = Index(['a|b', 'a|c', 'b|c'])
770+
idx.str.get_dummies('|')
771+
755772
def test_join(self):
756773
values = Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h'])
757774
result = values.str.split('_').str.join('_')

0 commit comments

Comments
 (0)