Skip to content

Commit fee0a7b

Browse files
committed
ENH: improve extract and get_dummies methods for Index.str (fix for #9980)
1 parent 45f69cd commit fee0a7b

File tree

3 files changed

+55
-23
lines changed

3 files changed

+55
-23
lines changed

doc/source/whatsnew/v0.16.1.txt

+2-2
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,7 @@ Enhancements
4141
- ``DataFrame.diff`` now takes an ``axis`` parameter that determines the direction of differencing (:issue:`9727`)
4242
- Added ``StringMethods`` (.str accessor) to ``Index`` (:issue:`9068`)
4343
- Added ``StringMethods.normalize()`` which behaves the same as standard :func:`unicodedata.normalize` (:issue:`10031`)
44-
4544
- Added ``StringMethods.partition()`` and ``rpartition()`` which behave the same as the standard ``str`` methods (:issue:`9773`)
46-
- Allow clip, clip_lower, and clip_upper to accept array-like arguments as thresholds (:issue:`6966`). These methods now have an ``axis`` parameter which determines how the Series or DataFrame will be aligned with the threshold(s).
4745

4846
The ``.str`` accessor is now available for both ``Series`` and ``Index``.
4947

@@ -65,6 +63,8 @@ Enhancements
6563
idx.str.startswith('a')
6664
s[s.index.str.startswith('a')]
6765

66+
- Improved ``extract`` and ``get_dummies`` methods for ``Index.str`` (:issue:`9980`)
67+
- Allow clip, clip_lower, and clip_upper to accept array-like arguments as thresholds (:issue:`6966`). These methods now have an ``axis`` parameter which determines how the Series or DataFrame will be aligned with the threshold(s).
6868
- ``DataFrame.mask()`` and ``Series.mask()`` now support the same keywords as ``where`` (:issue:`8801`)
6969

7070
- The ``drop`` function can now accept an ``errors`` keyword to suppress the ``ValueError`` raised when any of the labels does not exist in the target data. (:issue:`6736`)

pandas/core/strings.py

+17-3
Original file line numberDiff line numberDiff line change
@@ -466,6 +466,7 @@ def str_extract(arr, pat, flags=0):
466466
"""
467467
from pandas.core.series import Series
468468
from pandas.core.frame import DataFrame
469+
from pandas.core.index import Index
469470

470471
regex = re.compile(pat, flags=flags)
471472
# just to be safe, check this
@@ -481,11 +482,19 @@ def f(x):
481482
return [np.nan if item is None else item for item in m.groups()]
482483
else:
483484
return empty_row
485+
484486
if regex.groups == 1:
485-
result = Series([f(val)[0] for val in arr],
486-
name=_get_single_group_name(regex),
487-
index=arr.index, dtype=object)
487+
if isinstance(arr, Index):
488+
result = Index([f(val)[0] for val in arr],
489+
name=_get_single_group_name(regex),
490+
dtype=object)
491+
else:
492+
result = Series([f(val)[0] for val in arr],
493+
name=_get_single_group_name(regex),
494+
index=arr.index, dtype=object)
488495
else:
496+
if isinstance(arr, Index):
497+
raise ValueError("only one regex group is supported with Index")
489498
names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
490499
columns = [names.get(1 + i, i) for i in range(regex.groups)]
491500
if arr.empty:
@@ -531,6 +540,11 @@ def str_get_dummies(arr, sep='|'):
531540
pandas.get_dummies
532541
"""
533542
from pandas.core.frame import DataFrame
543+
from pandas.core.index import Index
544+
545+
# GH9980, Index.str does not support get_dummies() as it returns a frame
546+
if isinstance(arr, Index):
547+
raise TypeError("get_dummies is not supported for string methods on Index")
534548

535549
# TODO remove this hack?
536550
arr = arr.fillna('')

pandas/tests/test_strings.py

+36-18
Original file line numberDiff line numberDiff line change
@@ -516,7 +516,6 @@ def test_match(self):
516516

517517
def test_extract(self):
518518
# Contains tests like those in test_match and some others.
519-
520519
values = Series(['fooBAD__barBAD', NA, 'foo'])
521520
er = [NA, NA] # empty row
522521

@@ -540,15 +539,31 @@ def test_extract(self):
540539
exp = DataFrame([[u('BAD__'), u('BAD')], er, er])
541540
tm.assert_frame_equal(result, exp)
542541

543-
# no groups
544-
s = Series(['A1', 'B2', 'C3'])
545-
f = lambda: s.str.extract('[ABC][123]')
546-
self.assertRaises(ValueError, f)
547-
548-
# only non-capturing groups
549-
f = lambda: s.str.extract('(?:[AB]).*')
550-
self.assertRaises(ValueError, f)
542+
# GH9980
543+
# Index only works with one regex group since
544+
# multi-group would expand to a frame
545+
idx = Index(['A1', 'A2', 'A3', 'A4', 'B5'])
546+
with tm.assertRaisesRegexp(ValueError, "supported"):
547+
idx.str.extract('([AB])([123])')
548+
549+
# these should work for both Series and Index
550+
for klass in [Series, Index]:
551+
# no groups
552+
s_or_idx = klass(['A1', 'B2', 'C3'])
553+
f = lambda: s_or_idx.str.extract('[ABC][123]')
554+
self.assertRaises(ValueError, f)
555+
556+
# only non-capturing groups
557+
f = lambda: s_or_idx.str.extract('(?:[AB]).*')
558+
self.assertRaises(ValueError, f)
559+
560+
# single group renames series/index properly
561+
s_or_idx = klass(['A1', 'A2'])
562+
result = s_or_idx.str.extract(r'(?P<uno>A)\d')
563+
tm.assert_equal(result.name, 'uno')
564+
tm.assert_array_equal(result, klass(['A', 'A']))
551565

566+
s = Series(['A1', 'B2', 'C3'])
552567
# one group, no matches
553568
result = s.str.extract('(_)')
554569
exp = Series([NA, NA, NA], dtype=object)
@@ -569,14 +584,16 @@ def test_extract(self):
569584
exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]])
570585
tm.assert_frame_equal(result, exp)
571586

572-
# named group/groups
573-
result = s.str.extract('(?P<letter>[AB])(?P<number>[123])')
574-
exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]], columns=['letter', 'number'])
575-
tm.assert_frame_equal(result, exp)
587+
# one named group
576588
result = s.str.extract('(?P<letter>[AB])')
577589
exp = Series(['A', 'B', NA], name='letter')
578590
tm.assert_series_equal(result, exp)
579591

592+
# two named groups
593+
result = s.str.extract('(?P<letter>[AB])(?P<number>[123])')
594+
exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]], columns=['letter', 'number'])
595+
tm.assert_frame_equal(result, exp)
596+
580597
# mix named and unnamed groups
581598
result = s.str.extract('([AB])(?P<number>[123])')
582599
exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]], columns=[0, 'number'])
@@ -602,11 +619,6 @@ def test_extract(self):
602619
exp = DataFrame([['A', '1'], ['B', '2'], ['C', NA]], columns=['letter', 'number'])
603620
tm.assert_frame_equal(result, exp)
604621

605-
# single group renames series properly
606-
s = Series(['A1', 'A2'])
607-
result = s.str.extract(r'(?P<uno>A)\d')
608-
tm.assert_equal(result.name, 'uno')
609-
610622
# GH6348
611623
# not passing index to the extractor
612624
def check_index(index):
@@ -761,6 +773,12 @@ def test_get_dummies(self):
761773
columns=list('7ab'))
762774
tm.assert_frame_equal(result, expected)
763775

776+
# GH9980
777+
# Index.str does not support get_dummies() as it returns a frame
778+
with tm.assertRaisesRegexp(TypeError, "not supported"):
779+
idx = Index(['a|b', 'a|c', 'b|c'])
780+
idx.str.get_dummies('|')
781+
764782
def test_join(self):
765783
values = Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h'])
766784
result = values.str.split('_').str.join('_')

0 commit comments

Comments
 (0)