diff --git a/doc/source/basics.rst b/doc/source/basics.rst index 38ba0b064c192..1b2c80f90f97b 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -1029,7 +1029,7 @@ with more than one group returns a DataFrame with one column per group. Series(['a1', 'b2', 'c3']).str.extract('([ab])(\d)') -Elements that do not match return a row of ``NaN``s. +Elements that do not match return a row filled with ``NaN``. Thus, a Series of messy strings can be "converted" into a like-indexed Series or DataFrame of cleaned-up or more useful strings, without necessitating ``get()`` to access tuples or ``re.match`` objects. @@ -1051,18 +1051,35 @@ can also be used. Testing for Strings that Match or Contain a Pattern ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -In previous versions, *extracting* match groups was accomplished by ``match``, -which returned a not-so-convenient Series of tuples. Starting in version 0.14, -the default behavior of match will change. It will return a boolean -indexer, analagous to the method ``contains``. -The distinction between -``match`` and ``contains`` is strictness: ``match`` relies on -strict ``re.match`` while ``contains`` relies on ``re.search``. +You can check whether elements contain a pattern: -In version 0.13, ``match`` performs its old, deprecated behavior by default, -but the new behavior is availabe through the keyword argument -``as_indexer=True``. +.. ipython:: python + + pattern = r'[a-z][0-9]' + Series(['1', '2', '3a', '3b', '03c']).str.contains(pattern) + +or match a pattern: + + +.. ipython:: python + + Series(['1', '2', '3a', '3b', '03c']).str.match(pattern, as_indexer=True) + +The distinction between ``match`` and ``contains`` is strictness: ``match`` +relies on strict ``re.match``, while ``contains`` relies on ``re.search``. + +.. warning:: + + In previous versions, ``match`` was for *extracting* groups, + returning a not-so-convenient Series of tuples. 
The new method ``extract`` + (described in the previous section) is now preferred. + + This old, deprecated behavior of ``match`` is still the default. As + demonstrated above, use the new behavior by setting ``as_indexer=True``. + In this mode, ``match`` is analogous to ``contains``, returning a boolean + Series. The new behavior will become the default behavior in a future + release. Methods like ``match``, ``contains``, ``startswith``, and ``endswith`` take an extra ``na`` arguement so missing values can be considered True or False: diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 3b1b220d3fac7..02f422bb0b635 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -164,6 +164,11 @@ def str_contains(arr, pat, case=True, flags=0, na=np.nan): Returns ------- + Series of boolean values + + See Also + -------- + match : analogous, but stricter, relying on re.match instead of re.search """ if not case: @@ -326,11 +331,22 @@ def str_match(arr, pat, case=True, flags=0, na=np.nan, as_indexer=False): as_indexer : False, by default, gives deprecated behavior better achieved using str_extract. True return boolean indexer. + Returns + ------- + boolean Series + if as_indexer=True + Series of tuples + if as_indexer=False, default but deprecated Returns ------- - matches : boolean array (if as_indexer=True) - matches : array of tuples (if as_indexer=False, default but deprecated) + Series of boolean values + + See Also + -------- + contains : analogous, but less strict, relying on re.search instead of + re.match + extract : now preferred to the deprecated usage of match (as_indexer=False) Notes ----- @@ -385,10 +401,27 @@ def str_extract(arr, pat, flags=0): ------- extracted groups : Series (one group) or DataFrame (multiple groups) + Examples + -------- + A pattern with one group will return a Series. Non-matches will be NaN. - Notes - ----- - Compare to the string method match, which returns re.match objects. 
+ >>> Series(['a1', 'b2', 'c3']).str.extract('[ab](\d)') + 0 1 + 1 2 + 2 NaN + dtype: object + + A pattern with more than one group will return a DataFrame. + + >>> Series(['a1', 'b2', 'c3']).str.extract('([ab])(\d)') + + A pattern may contain optional groups. + + >>> Series(['a1', 'b2', 'c3']).str.extract('([ab])?(\d)') + + Named groups will become column names in the result. + + >>> Series(['a1', 'b2', 'c3']).str.extract('(?P<letter>[ab])(?P<digit>\d)') """ regex = re.compile(pat, flags=flags)