Skip to content

Commit 11e3b5a

Browse files
committed
ENH: improve extract and get_dummies methods for Index.str (fix for #9980)
1 parent 845cec9 commit 11e3b5a

File tree

3 files changed

+56
-23
lines changed

3 files changed

+56
-23
lines changed

doc/source/whatsnew/v0.16.1.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@ Enhancements
2424

2525
- Added ``StringMethods.capitalize()`` and ``swapcase()``, which behave the same as the standard ``str`` methods (:issue:`9766`)
2626
- Added ``StringMethods`` (.str accessor) to ``Index`` (:issue:`9068`)
27-
- Allow clip, clip_lower, and clip_upper to accept array-like arguments as thresholds (:issue:`6966`). These methods now have an ``axis`` parameter which determines how the Series or DataFrame will be aligned with the threshold(s).
2827

2928
The ``.str`` accessor is now available for both ``Series`` and ``Index``.
3029

@@ -46,6 +45,8 @@ Enhancements
4645
idx.str.startswith('a')
4746
s[s.index.str.startswith('a')]
4847

48+
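- Improved ``extract`` and ``get_dummies`` methods for ``Index.str``: ``extract`` with a single regex group returns an ``Index``, while ``extract`` with multiple groups raises ``ValueError`` and ``get_dummies`` raises ``TypeError``, since both would return a ``DataFrame`` (:issue:`9980`)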
49+
- Allow clip, clip_lower, and clip_upper to accept array-like arguments as thresholds (:issue:`6966`). These methods now have an ``axis`` parameter which determines how the Series or DataFrame will be aligned with the threshold(s).
4950
- ``DataFrame.mask()`` and ``Series.mask()`` now support the same keywords as ``where`` (:issue:`8801`)
5051

5152
- The ``drop`` function can now accept an ``errors`` keyword to suppress the ``ValueError`` raised when any of the labels does not exist in the target data. (:issue:`6736`)

pandas/core/strings.py

+18-4
Original file line numberDiff line numberDiff line change
@@ -435,12 +435,13 @@ def str_extract(arr, pat, flags=0):
435435
"""
436436
from pandas.core.series import Series
437437
from pandas.core.frame import DataFrame
438+
from pandas.core.index import Index
438439

439440
regex = re.compile(pat, flags=flags)
440441
# just to be safe, check this
441442
if regex.groups == 0:
442443
raise ValueError("This pattern contains no groups to capture.")
443-
empty_row = [np.nan]*regex.groups
444+
empty_row = [np.nan] * regex.groups
444445
def f(x):
445446
if not isinstance(x, compat.string_types):
446447
return empty_row
@@ -449,11 +450,19 @@ def f(x):
449450
return [np.nan if item is None else item for item in m.groups()]
450451
else:
451452
return empty_row
453+
452454
if regex.groups == 1:
453-
result = Series([f(val)[0] for val in arr],
454-
name=_get_single_group_name(regex),
455-
index=arr.index, dtype=object)
455+
if isinstance(arr, Index):
456+
result = Index([f(val)[0] for val in arr],
457+
name=_get_single_group_name(regex),
458+
dtype=object)
459+
else:
460+
result = Series([f(val)[0] for val in arr],
461+
name=_get_single_group_name(regex),
462+
index=arr.index, dtype=object)
456463
else:
464+
if isinstance(arr, Index):
465+
raise ValueError("only one regex group is supported with Index")
457466
names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
458467
columns = [names.get(1 + i, i) for i in range(regex.groups)]
459468
if arr.empty:
@@ -488,6 +497,11 @@ def str_get_dummies(arr, sep='|'):
488497
489498
"""
490499
from pandas.core.frame import DataFrame
500+
from pandas.core.index import Index
501+
502+
# GH9980, Index.str does not support get_dummies() as it returns a frame
503+
if isinstance(arr, Index):
504+
raise TypeError("get_dummies is not supported for string methods on Index")
491505

492506
# TODO remove this hack?
493507
arr = arr.fillna('')

pandas/tests/test_strings.py

+36-18
Original file line numberDiff line numberDiff line change
@@ -516,7 +516,6 @@ def test_match(self):
516516

517517
def test_extract(self):
518518
# Contains tests like those in test_match and some others.
519-
520519
values = Series(['fooBAD__barBAD', NA, 'foo'])
521520
er = [NA, NA] # empty row
522521

@@ -540,15 +539,31 @@ def test_extract(self):
540539
exp = DataFrame([[u('BAD__'), u('BAD')], er, er])
541540
tm.assert_frame_equal(result, exp)
542541

543-
# no groups
544-
s = Series(['A1', 'B2', 'C3'])
545-
f = lambda: s.str.extract('[ABC][123]')
546-
self.assertRaises(ValueError, f)
547-
548-
# only non-capturing groups
549-
f = lambda: s.str.extract('(?:[AB]).*')
550-
self.assertRaises(ValueError, f)
542+
# GH9980
543+
# Index only works with one regex group since
544+
# multi-group would expand to a frame
545+
idx = Index(['A1', 'A2', 'A3', 'A4', 'B5'])
546+
with tm.assertRaisesRegexp(ValueError, "supported"):
547+
idx.str.extract('([AB])([123])')
548+
549+
# these should work for both Series and Index
550+
for klass in [Series, Index]:
551+
# no groups
552+
s_or_idx = klass(['A1', 'B2', 'C3'])
553+
f = lambda: s_or_idx.str.extract('[ABC][123]')
554+
self.assertRaises(ValueError, f)
555+
556+
# only non-capturing groups
557+
f = lambda: s_or_idx.str.extract('(?:[AB]).*')
558+
self.assertRaises(ValueError, f)
559+
560+
# single group renames series/index properly
561+
s_or_idx = klass(['A1', 'A2'])
562+
result = s_or_idx.str.extract(r'(?P<uno>A)\d')
563+
tm.assert_equal(result.name, 'uno')
564+
tm.assert_array_equal(result, klass(['A', 'A']))
551565

566+
s = Series(['A1', 'B2', 'C3'])
552567
# one group, no matches
553568
result = s.str.extract('(_)')
554569
exp = Series([NA, NA, NA], dtype=object)
@@ -569,14 +584,16 @@ def test_extract(self):
569584
exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]])
570585
tm.assert_frame_equal(result, exp)
571586

572-
# named group/groups
573-
result = s.str.extract('(?P<letter>[AB])(?P<number>[123])')
574-
exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]], columns=['letter', 'number'])
575-
tm.assert_frame_equal(result, exp)
587+
# one named group
576588
result = s.str.extract('(?P<letter>[AB])')
577589
exp = Series(['A', 'B', NA], name='letter')
578590
tm.assert_series_equal(result, exp)
579591

592+
# two named groups
593+
result = s.str.extract('(?P<letter>[AB])(?P<number>[123])')
594+
exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]], columns=['letter', 'number'])
595+
tm.assert_frame_equal(result, exp)
596+
580597
# mix named and unnamed groups
581598
result = s.str.extract('([AB])(?P<number>[123])')
582599
exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]], columns=[0, 'number'])
@@ -602,11 +619,6 @@ def test_extract(self):
602619
exp = DataFrame([['A', '1'], ['B', '2'], ['C', NA]], columns=['letter', 'number'])
603620
tm.assert_frame_equal(result, exp)
604621

605-
# single group renames series properly
606-
s = Series(['A1', 'A2'])
607-
result = s.str.extract(r'(?P<uno>A)\d')
608-
tm.assert_equal(result.name, 'uno')
609-
610622
# GH6348
611623
# not passing index to the extractor
612624
def check_index(index):
@@ -752,6 +764,12 @@ def test_get_dummies(self):
752764
columns=list('7ab'))
753765
tm.assert_frame_equal(result, expected)
754766

767+
# GH9980
768+
# Index.str does not support get_dummies() as it returns a frame
769+
with tm.assertRaisesRegexp(TypeError, "not supported"):
770+
idx = Index(['a|b', 'a|c', 'b|c'])
771+
idx.str.get_dummies('|')
772+
755773
def test_join(self):
756774
values = Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h'])
757775
result = values.str.split('_').str.join('_')

0 commit comments

Comments
 (0)