Skip to content

Commit ed854ef

Browse files
committed
ENH/BUG: str.extractall doesn't support index
1 parent 4de83d2 commit ed854ef

File tree

4 files changed

+68
-17
lines changed

4 files changed

+68
-17
lines changed

doc/source/text.rst

+12-1
Original file line numberDiff line numberDiff line change
@@ -281,7 +281,7 @@ Unlike ``extract`` (which returns only the first match),
281281

282282
.. ipython:: python
283283
284-
s = pd.Series(["a1a2", "b1", "c1"], ["A", "B", "C"])
284+
s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"])
285285
s
286286
two_groups = '(?P<letter>[a-z])(?P<digit>[0-9])'
287287
s.str.extract(two_groups, expand=True)
@@ -313,6 +313,17 @@ then ``extractall(pat).xs(0, level='match')`` gives the same result as
313313
extractall_result
314314
extractall_result.xs(0, level="match")
315315
316+
``Index`` also supports ``.str.extractall``. It returns a ``DataFrame`` which has the
317+
same result as ``Series.str.extractall`` called on a ``Series`` with a default index (starting from 0).
318+
319+
.. versionadded:: 0.18.2
320+
321+
.. ipython:: python
322+
323+
pd.Index(["a1a2", "b1", "c1"]).str.extractall(two_groups)
324+
325+
pd.Series(["a1a2", "b1", "c1"]).str.extractall(two_groups)
326+
316327
317328
Testing for Strings that Match or Contain a Pattern
318329
---------------------------------------------------

doc/source/whatsnew/v0.18.2.txt

+6
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,12 @@ Other enhancements
3131

3232
- The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. The default behaviour is still to raise a ``NonExistentTimeError`` (:issue:`13057`)
3333

34+
- ``Index`` now supports ``.str.extractall()`` which returns a ``DataFrame``, see :ref:`Extract all matches in each subject (extractall) <text.extractall>` (:issue:`10008`, :issue:`13156`)
3435

36+
.. ipython:: python
37+
38+
idx = pd.Index(["a1a2", "b1", "c1"])
39+
idx.str.extractall("[ab](?P<digit>\d)")
3540

3641

3742
.. _whatsnew_0182.api:
@@ -117,6 +122,7 @@ Bug Fixes
117122

118123

119124

125+
- Bug in ``Series.str.extractall()`` with ``str`` index raises ``ValueError`` (:issue:`13156`)
120126

121127

122128
- Bug in ``PeriodIndex`` and ``Period`` subtraction raises ``AttributeError`` (:issue:`13071`)

pandas/core/strings.py

+24-14
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from pandas.core.algorithms import take_1d
99
import pandas.compat as compat
1010
from pandas.core.base import AccessorProperty, NoNewAttributesMixin
11+
from pandas.types import api as gt
1112
from pandas.util.decorators import Appender, deprecate_kwarg
1213
import re
1314
import pandas.lib as lib
@@ -148,12 +149,10 @@ def _na_map(f, arr, na_result=np.nan, dtype=object):
148149

149150

150151
def _map(f, arr, na_mask=False, na_value=np.nan, dtype=object):
151-
from pandas.core.series import Series
152-
153152
if not len(arr):
154153
return np.ndarray(0, dtype=dtype)
155154

156-
if isinstance(arr, Series):
155+
if isinstance(arr, gt.ABCSeries):
157156
arr = arr.values
158157
if not isinstance(arr, np.ndarray):
159158
arr = np.asarray(arr, dtype=object)
@@ -687,33 +686,42 @@ def str_extractall(arr, pat, flags=0):
687686
C 0 NaN 1
688687
689688
"""
690-
from pandas import DataFrame, MultiIndex
689+
691690
regex = re.compile(pat, flags=flags)
692691
# the regex must contain capture groups.
693692
if regex.groups == 0:
694693
raise ValueError("pattern contains no capture groups")
694+
695+
if isinstance(arr, gt.ABCIndex):
696+
arr = arr.to_series().reset_index(drop=True)
697+
695698
names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
696699
columns = [names.get(1 + i, i) for i in range(regex.groups)]
697700
match_list = []
698701
index_list = []
702+
is_mi = arr.index.nlevels > 1
703+
699704
for subject_key, subject in arr.iteritems():
700705
if isinstance(subject, compat.string_types):
701-
try:
702-
key_list = list(subject_key)
703-
except TypeError:
704-
key_list = [subject_key]
706+
707+
if not is_mi:
708+
subject_key = (subject_key, )
709+
705710
for match_i, match_tuple in enumerate(regex.findall(subject)):
706-
na_tuple = [
707-
np.NaN if group == "" else group for group in match_tuple]
711+
na_tuple = [np.NaN if group == "" else group
712+
for group in match_tuple]
708713
match_list.append(na_tuple)
709-
result_key = tuple(key_list + [match_i])
714+
result_key = tuple(subject_key + (match_i, ))
710715
index_list.append(result_key)
716+
711717
if 0 < len(index_list):
718+
from pandas import MultiIndex
712719
index = MultiIndex.from_tuples(
713720
index_list, names=arr.index.names + ["match"])
714721
else:
715722
index = None
716-
result = DataFrame(match_list, index, columns)
723+
result = arr._constructor_expanddim(match_list, index=index,
724+
columns=columns)
717725
return result
718726

719727

@@ -1804,9 +1812,9 @@ class StringAccessorMixin(object):
18041812

18051813
# string methods
18061814
def _make_str_accessor(self):
1807-
from pandas.core.series import Series
18081815
from pandas.core.index import Index
1809-
if (isinstance(self, Series) and
1816+
1817+
if (isinstance(self, gt.ABCSeries) and
18101818
not ((is_categorical_dtype(self.dtype) and
18111819
is_object_dtype(self.values.categories)) or
18121820
(is_object_dtype(self.dtype)))):
@@ -1819,6 +1827,8 @@ def _make_str_accessor(self):
18191827
"values, which use np.object_ dtype in "
18201828
"pandas")
18211829
elif isinstance(self, Index):
1830+
# can't use ABCIndex to exclude non-str
1831+
18221832
# see src/inference.pyx which can contain string values
18231833
allowed_types = ('string', 'unicode', 'mixed', 'mixed-integer')
18241834
if self.inferred_type not in allowed_types:

pandas/tests/test_strings.py

+26-2
Original file line numberDiff line numberDiff line change
@@ -982,6 +982,30 @@ def test_extractall_no_matches(self):
982982
"second"])
983983
tm.assert_frame_equal(r, e)
984984

985+
def test_extractall_stringindex(self):
986+
s = Series(["a1a2", "b1", "c1"], name='xxx')
987+
res = s.str.extractall("[ab](?P<digit>\d)")
988+
exp_idx = MultiIndex.from_tuples([(0, 0), (0, 1), (1, 0)],
989+
names=[None, 'match'])
990+
exp = DataFrame({'digit': ["1", "2", "1"]}, index=exp_idx)
991+
tm.assert_frame_equal(res, exp)
992+
993+
# index should return the same result as the default index without name
994+
# thus index.name doesn't affect the result
995+
for idx in [Index(["a1a2", "b1", "c1"]),
996+
Index(["a1a2", "b1", "c1"], name='xxx')]:
997+
998+
res = idx.str.extractall("[ab](?P<digit>\d)")
999+
tm.assert_frame_equal(res, exp)
1000+
1001+
s = Series(["a1a2", "b1", "c1"], name='s_name',
1002+
index=Index(["XX", "yy", "zz"], name='idx_name'))
1003+
res = s.str.extractall("[ab](?P<digit>\d)")
1004+
exp_idx = MultiIndex.from_tuples([("XX", 0), ("XX", 1), ("yy", 0)],
1005+
names=["idx_name", 'match'])
1006+
exp = DataFrame({'digit': ["1", "2", "1"]}, index=exp_idx)
1007+
tm.assert_frame_equal(res, exp)
1008+
9851009
def test_extractall_errors(self):
9861010
# Does not make sense to use extractall with a regex that has
9871011
# no capture groups. (it returns DataFrame with one column for
@@ -991,8 +1015,8 @@ def test_extractall_errors(self):
9911015
s.str.extractall(r'[a-z]')
9921016

9931017
def test_extract_index_one_two_groups(self):
994-
s = Series(
995-
['a3', 'b3', 'd4c2'], ["A3", "B3", "D4"], name='series_name')
1018+
s = Series(['a3', 'b3', 'd4c2'], index=["A3", "B3", "D4"],
1019+
name='series_name')
9961020
r = s.index.str.extract(r'([A-Z])', expand=True)
9971021
e = DataFrame(['A', "B", "D"])
9981022
tm.assert_frame_equal(r, e)

0 commit comments

Comments
 (0)