Skip to content

Commit 351ba92

Browse files
committed
Merge pull request #4696 from danielballan/str_extract
ENH: Series.str.extract returns regex matches more conveniently
2 parents 42c4bef + 49eb724 commit 351ba92

File tree

5 files changed

+219
-2
lines changed

5 files changed

+219
-2
lines changed

doc/source/basics.rst

+37
Original file line numberDiff line numberDiff line change
@@ -975,6 +975,42 @@ Methods like ``replace`` and ``findall`` take regular expressions, too:
975975
s3
976976
s3.str.replace('^.a|dog', 'XX-XX ', case=False)
977977
978+
The method ``match`` returns the groups in a regular expression in one tuple.
979+
Starting in pandas version 0.13, the method ``extract`` is available to
980+
accomplish this more conveniently.
981+
982+
Extracting a regular expression with one group returns a Series of strings.
983+
984+
.. ipython:: python
985+
986+
Series(['a1', 'b2', 'c3']).str.extract('[ab](\d)')
987+
988+
Elements that do not match return ``NaN``. Extracting a regular expression
989+
with more than one group returns a DataFrame with one column per group.
990+
991+
.. ipython:: python
992+
993+
Series(['a1', 'b2', 'c3']).str.extract('([ab])(\d)')
994+
995+
Elements that do not match return a row filled with ``NaN``.
996+
Thus, a Series of messy strings can be "converted" into a
997+
like-indexed Series or DataFrame of cleaned-up or more useful strings,
998+
without necessitating ``get()`` to access tuples or ``re.match`` objects.
999+
1000+
Named groups like
1001+
1002+
.. ipython:: python
1003+
1004+
Series(['a1', 'b2', 'c3']).str.extract('(?P<letter>[ab])(?P<digit>\d)')
1005+
1006+
and optional groups like
1007+
1008+
.. ipython:: python
1009+
1010+
Series(['a1', 'b2', '3']).str.extract('(?P<letter>[ab])?(?P<digit>\d)')
1011+
1012+
can also be used.
1013+
9781014
Methods like ``contains``, ``startswith``, and ``endswith`` take an extra
9791015
``na`` argument so missing values can be considered True or False:
9801016

@@ -1003,6 +1039,7 @@ Methods like ``contains``, ``startswith``, and ``endswith`` takes an extra
10031039
``endswith``,Equivalent to ``str.endswith(pat)`` for each element
10041040
``findall``,Compute list of all occurrences of pattern/regex for each string
10051041
``match``,"Call ``re.match`` on each element, returning matched groups as list"
1042+
``extract``,"Call ``re.match`` on each element, as ``match`` does, but return matched groups as strings for convenience."
10061043
``len``,Compute string lengths
10071044
``strip``,Equivalent to ``str.strip``
10081045
``rstrip``,Equivalent to ``str.rstrip``

doc/source/release.rst

+1
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ New features
5959

6060
- Added ``isin`` method to DataFrame (:issue:`4211`)
6161
- Clipboard functionality now works with PySide (:issue:`4282`)
62+
- New ``extract`` string method returns regex matches more conveniently (:issue:`4685`)
6263

6364
Improvements to existing features
6465
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

doc/source/v0.13.0.txt

+32
Original file line numberDiff line numberDiff line change
@@ -297,6 +297,38 @@ Enhancements
297297
the bandwidth, and to gkde.evaluate() to specify the indices at which it
298298
is evaluated, respectively. See scipy docs.
299299
- DataFrame constructor now accepts a numpy masked record array (:issue:`3478`)
300+
- The new vectorized string method ``extract`` returns regular expression
301+
matches more conveniently.
302+
303+
.. ipython:: python
304+
305+
Series(['a1', 'b2', 'c3']).str.extract('[ab](\d)')
306+
307+
Elements that do not match return ``NaN``. Extracting a regular expression
308+
with more than one group returns a DataFrame with one column per group.
309+
310+
311+
.. ipython:: python
312+
313+
Series(['a1', 'b2', 'c3']).str.extract('([ab])(\d)')
314+
315+
Elements that do not match return a row filled with ``NaN``.
316+
Thus, a Series of messy strings can be "converted" into a
317+
like-indexed Series or DataFrame of cleaned-up or more useful strings,
318+
without necessitating ``get()`` to access tuples or ``re.match`` objects.
319+
320+
Named groups like
321+
322+
.. ipython:: python
323+
324+
Series(['a1', 'b2', 'c3']).str.extract('(?P<letter>[ab])(?P<digit>\d)')
325+
326+
and optional groups like
327+
328+
.. ipython:: python
329+
Series(['a1', 'b2', '3']).str.extract('(?P<letter>[ab])?(?P<digit>\d)')
330+
331+
can also be used.
300332

301333

302334
.. _whatsnew_0130.experimental:

pandas/core/strings.py

+61-2
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from pandas.compat import zip
44
from pandas.core.common import isnull, _values_from_object
55
from pandas.core.series import Series
6+
from pandas.core.frame import DataFrame
67
import pandas.compat as compat
78
import re
89
import pandas.lib as lib
@@ -328,6 +329,59 @@ def f(x):
328329

329330
return _na_map(f, arr)
330331

332+
def str_extract(arr, pat, flags=0):
    """
    Find groups in each string (from beginning) using passed regular expression

    Parameters
    ----------
    arr : Series
        Object whose elements are searched; it must provide ``apply``
    pat : string
        Pattern or regular expression; must contain at least one capturing
        group
    flags : int, default 0 (no flags)
        re module flags, e.g. re.IGNORECASE

    Returns
    -------
    extracted groups : Series (one group) or DataFrame (multiple groups)
        Elements that are not strings, or that do not match, yield NaN.
        A named group supplies the Series name / column label; unnamed
        groups of a DataFrame are labelled by their zero-based position.

    Raises
    ------
    ValueError
        If the pattern contains no capturing groups.

    Note
    ----
    Compare to the string method match, which returns the matched groups
    of each element as a tuple.
    """
    regex = re.compile(pat, flags=flags)

    # just to be safe, check this
    if regex.groups == 0:
        raise ValueError("This pattern contains no groups to capture.")

    # ``groupindex`` maps group name -> 1-based group number. Invert it so
    # that a group's name can be looked up from its number.
    names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))

    if regex.groups == 1:
        def f(x):
            if not isinstance(x, compat.string_types):
                return None
            m = regex.match(x)
            if m:
                return m.groups()[0]  # may be None
            else:
                return None
    else:
        empty_row = Series(regex.groups * [None])

        def f(x):
            if not isinstance(x, compat.string_types):
                return empty_row
            m = regex.match(x)
            if m:
                return Series(list(m.groups()))  # may contain None
            else:
                return empty_row

    result = arr.apply(f)
    result.replace({None: np.nan}, inplace=True)
    if regex.groups > 1:
        result = DataFrame(result)  # Don't rely on the wrapper; name columns.
        result.columns = [names.get(1 + i, i) for i in range(regex.groups)]
    else:
        # Group numbers start at 1, so the only group is number 1. Looking
        # up 0 in ``groupindex`` (which is keyed by name) never finds it.
        result.name = names.get(1)
    return result
331385

332386
def str_join(arr, sep):
333387
"""
@@ -675,8 +729,12 @@ def __iter__(self):
675729
g = self.get(i)
676730

677731
def _wrap_result(self, result):
678-
return Series(result, index=self.series.index,
679-
name=self.series.name)
732+
assert result.ndim < 3
733+
if result.ndim == 1:
734+
return Series(result, index=self.series.index,
735+
name=self.series.name)
736+
else:
737+
return DataFrame(result, index=self.series.index)
680738

681739
@copy(str_cat)
682740
def cat(self, others=None, sep=None, na_rep=None):
@@ -764,6 +822,7 @@ def rstrip(self, to_strip=None):
764822
endswith = _pat_wrapper(str_endswith, na=True)
765823
findall = _pat_wrapper(str_findall, flags=True)
766824
match = _pat_wrapper(str_match, flags=True)
825+
extract = _pat_wrapper(str_extract, flags=True)
767826

768827
len = _noarg_wrapper(str_len)
769828
lower = _noarg_wrapper(str_lower)

pandas/tests/test_strings.py

+88
Original file line numberDiff line numberDiff line change
@@ -415,6 +415,94 @@ def test_match(self):
415415
exp = Series([(u('BAD__'), u('BAD')), NA, []])
416416
tm.assert_series_equal(result, exp)
417417

418+
def test_extract(self):
    # Mirrors the cases in test_match and adds extract-specific ones:
    # group counting, named groups, non-capturing and optional groups.
    bad_pat = '.*(BAD[_]+).*(BAD)'
    er = [NA, NA]  # row expected wherever nothing could be extracted

    # plain strings
    plain = Series(['fooBAD__barBAD', NA, 'foo'])
    extracted = plain.str.extract(bad_pat)
    expected = DataFrame([['BAD__', 'BAD'], er, er])
    tm.assert_frame_equal(extracted, expected)

    # mixed: non-string elements must produce empty rows
    mixed = Series(['aBAD_BAD', NA, 'BAD_b_BAD', True, datetime.today(),
                    'foo', None, 1, 2.])
    extracted = Series(mixed).str.extract(bad_pat)
    hit = ['BAD_', 'BAD']
    expected = DataFrame([hit, er, hit] + [er] * 6)
    tm.assert_frame_equal(extracted, expected)

    # unicode
    uni = Series([u('fooBAD__barBAD'), NA, u('foo')])
    extracted = uni.str.extract(bad_pat)
    expected = DataFrame([[u('BAD__'), u('BAD')], er, er])
    tm.assert_frame_equal(extracted, expected)

    s = Series(['A1', 'B2', 'C3'])

    # no groups
    self.assertRaises(ValueError, s.str.extract, '[ABC][123]')

    # only non-capturing groups
    self.assertRaises(ValueError, s.str.extract, '(?:[AB]).*')

    # one group, no matches
    tm.assert_series_equal(s.str.extract('(_)'),
                           Series([NA, NA, NA]))

    # two groups, no matches
    tm.assert_frame_equal(s.str.extract('(_)(_)'),
                          DataFrame([[NA, NA], [NA, NA], [NA, NA]]))

    # one group, some matches
    tm.assert_series_equal(s.str.extract('([AB])[123]'),
                           Series(['A', 'B', NA]))

    # two groups, some matches
    tm.assert_frame_equal(s.str.extract('([AB])([123])'),
                          DataFrame([['A', '1'], ['B', '2'], [NA, NA]]))

    # named group/groups
    extracted = s.str.extract('(?P<letter>[AB])(?P<number>[123])')
    expected = DataFrame([['A', '1'], ['B', '2'], [NA, NA]],
                         columns=['letter', 'number'])
    tm.assert_frame_equal(extracted, expected)
    extracted = s.str.extract('(?P<letter>[AB])')
    expected = Series(['A', 'B', NA], name='letter')
    tm.assert_series_equal(extracted, expected)

    # mix named and unnamed groups
    extracted = s.str.extract('([AB])(?P<number>[123])')
    expected = DataFrame([['A', '1'], ['B', '2'], [NA, NA]],
                         columns=[0, 'number'])
    tm.assert_frame_equal(extracted, expected)

    # one normal group, one non-capturing group
    tm.assert_series_equal(s.str.extract('([AB])(?:[123])'),
                           Series(['A', 'B', NA]))

    # two normal groups, one non-capturing group
    doubled = Series(['A11', 'B22', 'C33'])
    extracted = doubled.str.extract('([AB])([123])(?:[123])')
    expected = DataFrame([['A', '1'], ['B', '2'], [NA, NA]])
    tm.assert_frame_equal(extracted, expected)

    # one optional group followed by one normal group
    leading_optional = Series(['A1', 'B2', '3'])
    extracted = leading_optional.str.extract(
        '(?P<letter>[AB])?(?P<number>[123])')
    expected = DataFrame([['A', '1'], ['B', '2'], [NA, '3']],
                         columns=['letter', 'number'])
    tm.assert_frame_equal(extracted, expected)

    # one normal group followed by one optional group
    trailing_optional = Series(['A1', 'B2', 'C'])
    extracted = trailing_optional.str.extract(
        '(?P<letter>[ABC])(?P<number>[123])?')
    expected = DataFrame([['A', '1'], ['B', '2'], ['C', NA]],
                         columns=['letter', 'number'])
    tm.assert_frame_equal(extracted, expected)
505+
418506
def test_join(self):
419507
values = Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h'])
420508
result = values.str.split('_').str.join('_')

0 commit comments

Comments
 (0)