diff --git a/doc/source/text.rst b/doc/source/text.rst index 16b16a320f75b..3822c713d7f85 100644 --- a/doc/source/text.rst +++ b/doc/source/text.rst @@ -281,7 +281,7 @@ Unlike ``extract`` (which returns only the first match), .. ipython:: python - s = pd.Series(["a1a2", "b1", "c1"], ["A", "B", "C"]) + s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"]) s two_groups = '(?P<letter>[a-z])(?P<digit>[0-9])' s.str.extract(two_groups, expand=True) @@ -313,6 +313,17 @@ then ``extractall(pat).xs(0, level='match')`` gives the same result as extractall_result extractall_result.xs(0, level="match") +``Index`` also supports ``.str.extractall``. It returns a ``DataFrame`` which has the +same result as a ``Series.str.extractall`` with a default index (starts from 0). + +.. versionadded:: 0.18.2 + +.. ipython:: python + + pd.Index(["a1a2", "b1", "c1"]).str.extractall(two_groups) + + pd.Series(["a1a2", "b1", "c1"]).str.extractall(two_groups) + Testing for Strings that Match or Contain a Pattern --------------------------------------------------- diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 85209c0dfa03d..178c3b858b94e 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -31,7 +31,12 @@ Other enhancements - The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. The default behaviour remains to raising a ``NonExistentTimeError`` (:issue:`13057`) +- ``Index`` now supports ``.str.extractall()`` which returns ``DataFrame``, see :ref:`Extract all matches in each subject (extractall) <text.extractall>` (:issue:`10008`, :issue:`13156`) + .. ipython:: python + + idx = pd.Index(["a1a2", "b1", "c1"]) + idx.str.extractall("[ab](?P<digit>\d)") .. 
_whatsnew_0182.api: @@ -117,6 +122,7 @@ Bug Fixes +- Bug in ``Series.str.extractall()`` with ``str`` index raises ``ValueError`` (:issue:`13156`) - Bug in ``PeriodIndex`` and ``Period`` subtraction raises ``AttributeError`` (:issue:`13071`) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 524c0205d7f73..5b1b8bd05af42 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -8,6 +8,7 @@ from pandas.core.algorithms import take_1d import pandas.compat as compat from pandas.core.base import AccessorProperty, NoNewAttributesMixin +from pandas.types import api as gt from pandas.util.decorators import Appender, deprecate_kwarg import re import pandas.lib as lib @@ -148,12 +149,10 @@ def _na_map(f, arr, na_result=np.nan, dtype=object): def _map(f, arr, na_mask=False, na_value=np.nan, dtype=object): - from pandas.core.series import Series - if not len(arr): return np.ndarray(0, dtype=dtype) - if isinstance(arr, Series): + if isinstance(arr, gt.ABCSeries): arr = arr.values if not isinstance(arr, np.ndarray): arr = np.asarray(arr, dtype=object) @@ -687,33 +686,42 @@ def str_extractall(arr, pat, flags=0): C 0 NaN 1 """ - from pandas import DataFrame, MultiIndex + regex = re.compile(pat, flags=flags) # the regex must contain capture groups. 
if regex.groups == 0: raise ValueError("pattern contains no capture groups") + + if isinstance(arr, gt.ABCIndex): + arr = arr.to_series().reset_index(drop=True) + names = dict(zip(regex.groupindex.values(), regex.groupindex.keys())) columns = [names.get(1 + i, i) for i in range(regex.groups)] match_list = [] index_list = [] + is_mi = arr.index.nlevels > 1 + for subject_key, subject in arr.iteritems(): if isinstance(subject, compat.string_types): - try: - key_list = list(subject_key) - except TypeError: - key_list = [subject_key] + + if not is_mi: + subject_key = (subject_key, ) + for match_i, match_tuple in enumerate(regex.findall(subject)): - na_tuple = [ - np.NaN if group == "" else group for group in match_tuple] + na_tuple = [np.NaN if group == "" else group + for group in match_tuple] match_list.append(na_tuple) - result_key = tuple(key_list + [match_i]) + result_key = tuple(subject_key + (match_i, )) index_list.append(result_key) + if 0 < len(index_list): + from pandas import MultiIndex index = MultiIndex.from_tuples( index_list, names=arr.index.names + ["match"]) else: index = None - result = DataFrame(match_list, index, columns) + result = arr._constructor_expanddim(match_list, index=index, + columns=columns) return result @@ -1804,9 +1812,9 @@ class StringAccessorMixin(object): # string methods def _make_str_accessor(self): - from pandas.core.series import Series from pandas.core.index import Index - if (isinstance(self, Series) and + + if (isinstance(self, gt.ABCSeries) and not ((is_categorical_dtype(self.dtype) and is_object_dtype(self.values.categories)) or (is_object_dtype(self.dtype)))): @@ -1819,6 +1827,8 @@ def _make_str_accessor(self): "values, which use np.object_ dtype in " "pandas") elif isinstance(self, Index): + # can't use ABCIndex to exclude non-str + # see src/inference.pyx which can contain string values allowed_types = ('string', 'unicode', 'mixed', 'mixed-integer') if self.inferred_type not in allowed_types: diff --git 
a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 4179949bc49a6..05525acedc245 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -982,6 +982,30 @@ def test_extractall_no_matches(self): "second"]) tm.assert_frame_equal(r, e) + def test_extractall_stringindex(self): + s = Series(["a1a2", "b1", "c1"], name='xxx') + res = s.str.extractall("[ab](?P<digit>\d)") + exp_idx = MultiIndex.from_tuples([(0, 0), (0, 1), (1, 0)], + names=[None, 'match']) + exp = DataFrame({'digit': ["1", "2", "1"]}, index=exp_idx) + tm.assert_frame_equal(res, exp) + + # index should return the same result as the default index without name + # thus index.name doesn't affect the result + for idx in [Index(["a1a2", "b1", "c1"]), + Index(["a1a2", "b1", "c1"], name='xxx')]: + + res = idx.str.extractall("[ab](?P<digit>\d)") + tm.assert_frame_equal(res, exp) + + s = Series(["a1a2", "b1", "c1"], name='s_name', + index=Index(["XX", "yy", "zz"], name='idx_name')) + res = s.str.extractall("[ab](?P<digit>\d)") + exp_idx = MultiIndex.from_tuples([("XX", 0), ("XX", 1), ("yy", 0)], + names=["idx_name", 'match']) + exp = DataFrame({'digit': ["1", "2", "1"]}, index=exp_idx) + tm.assert_frame_equal(res, exp) + def test_extractall_errors(self): # Does not make sense to use extractall with a regex that has # no capture groups. (it returns DataFrame with one column for @@ -991,8 +1015,8 @@ def test_extractall_errors(self): s.str.extractall(r'[a-z]') def test_extract_index_one_two_groups(self): - s = Series( - ['a3', 'b3', 'd4c2'], ["A3", "B3", "D4"], name='series_name') + s = Series(['a3', 'b3', 'd4c2'], index=["A3", "B3", "D4"], + name='series_name') r = s.index.str.extract(r'([A-Z])', expand=True) e = DataFrame(['A', "B", "D"]) tm.assert_frame_equal(r, e)