diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt index 493f299b2bf32..5c6a4f897e851 100755 --- a/doc/source/whatsnew/v0.16.1.txt +++ b/doc/source/whatsnew/v0.16.1.txt @@ -41,9 +41,7 @@ Enhancements - ``DataFrame.diff`` now takes an ``axis`` parameter that determines the direction of differencing (:issue:`9727`) - Added ``StringMethods`` (.str accessor) to ``Index`` (:issue:`9068`) - Added ``StringMethods.normalize()`` which behaves the same as standard :func:`unicodedata.normalize` (:issue:`10031`) - - Added ``StringMethods.partition()`` and ``rpartition()`` which behave the same as standard ``str`` (:issue:`9773`) -- Allow clip, clip_lower, and clip_upper to accept array-like arguments as thresholds (:issue:`6966`). These methods now have an ``axis`` parameter which determines how the Series or DataFrame will be aligned with the threshold(s). The ``.str`` accessor is now available for both ``Series`` and ``Index``. @@ -65,6 +63,8 @@ Enhancements idx.str.startswith('a') s[s.index.str.startswith('a')] +- Improved ``extract`` and ``get_dummies`` methods for ``Index.str`` (:issue:`9980`) +- Allow clip, clip_lower, and clip_upper to accept array-like arguments as thresholds (:issue:`6966`). These methods now have an ``axis`` parameter which determines how the Series or DataFrame will be aligned with the threshold(s). - ``DataFrame.mask()`` and ``Series.mask()`` now support the same keywords as ``where`` (:issue:`8801`) - ``drop`` function can now accept ``errors`` keyword to suppress ``ValueError`` raised when any of the labels does not exist in the target data. 
(:issue:`6736`) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 62e9e0fbc41ae..6e603f60e02a2 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -466,6 +466,7 @@ def str_extract(arr, pat, flags=0): """ from pandas.core.series import Series from pandas.core.frame import DataFrame + from pandas.core.index import Index regex = re.compile(pat, flags=flags) # just to be safe, check this @@ -481,11 +482,14 @@ def f(x): return [np.nan if item is None else item for item in m.groups()] else: return empty_row + if regex.groups == 1: - result = Series([f(val)[0] for val in arr], - name=_get_single_group_name(regex), - index=arr.index, dtype=object) + result = np.array([f(val)[0] for val in arr], dtype=object) + name = _get_single_group_name(regex) else: + if isinstance(arr, Index): + raise ValueError("only one regex group is supported with Index") + name = None names = dict(zip(regex.groupindex.values(), regex.groupindex.keys())) columns = [names.get(1 + i, i) for i in range(regex.groups)] if arr.empty: @@ -495,7 +499,7 @@ def f(x): columns=columns, index=arr.index, dtype=object) - return result + return result, name def str_get_dummies(arr, sep='|'): @@ -531,6 +535,11 @@ def str_get_dummies(arr, sep='|'): pandas.get_dummies """ from pandas.core.frame import DataFrame + from pandas.core.index import Index + + # GH9980, Index.str does not support get_dummies() as it returns a frame + if isinstance(arr, Index): + raise TypeError("get_dummies is not supported for string methods on Index") # TODO remove this hack? 
arr = arr.fillna('') @@ -991,7 +1000,7 @@ def __iter__(self): i += 1 g = self.get(i) - def _wrap_result(self, result): + def _wrap_result(self, result, **kwargs): # leave as it is to keep extract and get_dummies results # can be merged to _wrap_result_expand in v0.17 from pandas.core.series import Series @@ -1000,16 +1009,16 @@ def _wrap_result(self, result): if not hasattr(result, 'ndim'): return result - elif result.ndim == 1: - name = getattr(result, 'name', None) + name = kwargs.get('name') or getattr(result, 'name', None) or self.series.name + + if result.ndim == 1: if isinstance(self.series, Index): # if result is a boolean np.array, return the np.array # instead of wrapping it into a boolean Index (GH 8875) if is_bool_dtype(result): return result - return Index(result, name=name or self.series.name) - return Series(result, index=self.series.index, - name=name or self.series.name) + return Index(result, name=name) + return Series(result, index=self.series.index, name=name) else: assert result.ndim < 3 return DataFrame(result, index=self.series.index) @@ -1257,7 +1266,11 @@ def get_dummies(self, sep='|'): startswith = _pat_wrapper(str_startswith, na=True) endswith = _pat_wrapper(str_endswith, na=True) findall = _pat_wrapper(str_findall, flags=True) - extract = _pat_wrapper(str_extract, flags=True) + + @copy(str_extract) + def extract(self, pat, flags=0): + result, name = str_extract(self.series, pat, flags=flags) + return self._wrap_result(result, name=name) _shared_docs['find'] = (""" Return %(side)s indexes in each strings in the Series/Index diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 1f84e1dc4d155..c9b11810d83fe 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -516,7 +516,6 @@ def test_match(self): def test_extract(self): # Contains tests like those in test_match and some others. 
- values = Series(['fooBAD__barBAD', NA, 'foo']) er = [NA, NA] # empty row @@ -540,15 +539,31 @@ def test_extract(self): exp = DataFrame([[u('BAD__'), u('BAD')], er, er]) tm.assert_frame_equal(result, exp) - # no groups - s = Series(['A1', 'B2', 'C3']) - f = lambda: s.str.extract('[ABC][123]') - self.assertRaises(ValueError, f) - - # only non-capturing groups - f = lambda: s.str.extract('(?:[AB]).*') - self.assertRaises(ValueError, f) + # GH9980 + # Index only works with one regex group since + # multi-group would expand to a frame + idx = Index(['A1', 'A2', 'A3', 'A4', 'B5']) + with tm.assertRaisesRegexp(ValueError, "supported"): + idx.str.extract('([AB])([123])') + + # these should work for both Series and Index + for klass in [Series, Index]: + # no groups + s_or_idx = klass(['A1', 'B2', 'C3']) + f = lambda: s_or_idx.str.extract('[ABC][123]') + self.assertRaises(ValueError, f) + + # only non-capturing groups + f = lambda: s_or_idx.str.extract('(?:[AB]).*') + self.assertRaises(ValueError, f) + + # single group renames series/index properly + s_or_idx = klass(['A1', 'A2']) + result = s_or_idx.str.extract(r'(?P<uno>A)\d') + tm.assert_equal(result.name, 'uno') + tm.assert_array_equal(result, klass(['A', 'A'])) + s = Series(['A1', 'B2', 'C3']) # one group, no matches result = s.str.extract('(_)') exp = Series([NA, NA, NA], dtype=object) @@ -569,14 +584,16 @@ def test_extract(self): exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]]) tm.assert_frame_equal(result, exp) - # named group/groups - result = s.str.extract('(?P<letter>[AB])(?P<number>[123])') - exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]], columns=['letter', 'number']) - tm.assert_frame_equal(result, exp) + # one named group result = s.str.extract('(?P<letter>[AB])') exp = Series(['A', 'B', NA], name='letter') tm.assert_series_equal(result, exp) + # two named groups + result = s.str.extract('(?P<letter>[AB])(?P<number>[123])') + exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]], columns=['letter', 'number']) + tm.assert_frame_equal(result, exp) + 
# mix named and unnamed groups result = s.str.extract('([AB])(?P<number>[123])') exp = DataFrame([['A', '1'], ['B', '2'], [NA, NA]], columns=[0, 'number']) @@ -602,11 +619,6 @@ def test_extract(self): exp = DataFrame([['A', '1'], ['B', '2'], ['C', NA]], columns=['letter', 'number']) tm.assert_frame_equal(result, exp) - # single group renames series properly - s = Series(['A1', 'A2']) - result = s.str.extract(r'(?P<uno>A)\d') - tm.assert_equal(result.name, 'uno') - # GH6348 # not passing index to the extractor def check_index(index): @@ -761,6 +773,12 @@ def test_get_dummies(self): columns=list('7ab')) tm.assert_frame_equal(result, expected) + # GH9980 + # Index.str does not support get_dummies() as it returns a frame + with tm.assertRaisesRegexp(TypeError, "not supported"): + idx = Index(['a|b', 'a|c', 'b|c']) + idx.str.get_dummies('|') + def test_join(self): values = Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h']) result = values.str.split('_').str.join('_')