Skip to content

Commit f98bcb8

Browse files
committed
ENH: add StringMethods (.str accessor) to Index, fixes #9068
1 parent 9b842a0 commit f98bcb8

File tree

7 files changed

+113
-27
lines changed

7 files changed

+113
-27
lines changed

doc/source/text.rst

+9-2
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,10 @@ Working with Text Data
1717

1818
.. _text.string_methods:
1919

20-
Series is equipped with a set of string processing methods
20+
Series and Index are equipped with a set of string processing methods
2121
that make it easy to operate on each element of the array. Perhaps most
2222
importantly, these methods exclude missing/NA values automatically. These are
23-
accessed via the Series's ``str`` attribute and generally have names matching
23+
accessed via the ``str`` attribute and generally have names matching
2424
the equivalent (scalar) built-in string methods:
2525

2626
.. ipython:: python
@@ -30,6 +30,13 @@ the equivalent (scalar) built-in string methods:
3030
s.str.upper()
3131
s.str.len()
3232
33+
.. ipython:: python
34+
35+
idx = Index([' jack', 'jill ', ' jesse ', 'frank'])
36+
idx.str.strip()
37+
idx.str.lstrip()
38+
idx.str.rstrip()
39+
3340
Splitting and Replacing Strings
3441
-------------------------------
3542

doc/source/whatsnew/v0.16.1.txt

+18-1
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,28 @@ Enhancements
1818
~~~~~~~~~~~~
1919

2020
- Added ``StringMethods.capitalize()`` and ``StringMethods.swapcase()``, which behave the same as the standard ``str`` methods (:issue:`9766`)
21+
- Added ``StringMethods`` (.str accessor) to ``Index`` (:issue:`9068`)
2122

22-
- ``DataFrame.mask()`` and ``Series.mask()`` now support same keywords as ``where`` (:issue:`8801`)
23+
The ``.str`` accessor is now available for both ``Series`` and ``Index``.
24+
25+
.. ipython:: python
2326

27+
idx = Index([' jack', 'jill ', ' jesse ', 'frank'])
28+
idx.str.strip()
2429

30+
One special case for the ``.str`` accessor on ``Index`` is that if a string method returns ``bool``, the ``.str`` accessor
31+
will return a ``np.ndarray`` instead of a boolean ``Index`` (:issue:`8875`). This enables the following expression
32+
to work naturally:
2533

34+
.. ipython:: python
35+
36+
idx = Index(['a1', 'a2', 'b1', 'b2'])
37+
s = Series(range(4), index=idx)
38+
s
39+
idx.str.startswith('a')
40+
s[s.index.str.startswith('a')]
41+
42+
- ``DataFrame.mask()`` and ``Series.mask()`` now support the same keywords as ``where`` (:issue:`8801`)
2643
- The ``drop`` function can now accept an ``errors`` keyword to suppress the ``ValueError`` raised when any of the labels does not exist in the target data. (:issue:`6736`)
2744

2845
.. ipython:: python

pandas/core/base.py

+19
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import pandas.tslib as tslib
1111
import pandas.lib as lib
1212
from pandas.util.decorators import Appender, cache_readonly
13+
from pandas.core.strings import StringMethods
1314

1415

1516
_shared_docs = dict()
@@ -497,6 +498,24 @@ def searchsorted(self, key, side='left'):
497498
#### needs tests/doc-string
498499
return self.values.searchsorted(key, side=side)
499500

501+
# string methods
502+
def _make_str_accessor(self):
503+
from pandas.core.series import Series
504+
from pandas.core.index import Index
505+
if isinstance(self, Series) and not com.is_object_dtype(self.dtype):
506+
# this really should exclude all series with any non-string values,
507+
# but that isn't practical for performance reasons until we have a
508+
# str dtype (GH 9343)
509+
raise AttributeError("Can only use .str accessor with string "
510+
"values, which use np.object_ dtype in "
511+
"pandas")
512+
elif isinstance(self, Index) and self.inferred_type != 'string':
513+
raise AttributeError("Can only use .str accessor with string "
514+
"values (i.e. inferred_type is 'string')")
515+
return StringMethods(self)
516+
517+
str = AccessorProperty(StringMethods, _make_str_accessor)
518+
500519
_shared_docs['drop_duplicates'] = (
501520
"""Return %(klass)s with duplicate values removed
502521

pandas/core/series.py

-16
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@
2828
from pandas.core import generic, base
2929
from pandas.core.internals import SingleBlockManager
3030
from pandas.core.categorical import Categorical, CategoricalAccessor
31-
from pandas.core.strings import StringMethods
3231
from pandas.tseries.common import (maybe_to_datetimelike,
3332
CombinedDatetimelikeProperties)
3433
from pandas.tseries.index import DatetimeIndex
@@ -2494,21 +2493,6 @@ def to_period(self, freq=None, copy=True):
24942493
return self._constructor(new_values,
24952494
index=new_index).__finalize__(self)
24962495

2497-
#------------------------------------------------------------------------------
2498-
# string methods
2499-
2500-
def _make_str_accessor(self):
2501-
if not com.is_object_dtype(self.dtype):
2502-
# this really should exclude all series with any non-string values,
2503-
# but that isn't practical for performance reasons until we have a
2504-
# str dtype (GH 9343)
2505-
raise AttributeError("Can only use .str accessor with string "
2506-
"values, which use np.object_ dtype in "
2507-
"pandas")
2508-
return StringMethods(self)
2509-
2510-
str = base.AccessorProperty(StringMethods, _make_str_accessor)
2511-
25122496
#------------------------------------------------------------------------------
25132497
# Datetimelike delegation methods
25142498

pandas/core/strings.py

+20-8
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import numpy as np
22

33
from pandas.compat import zip
4-
from pandas.core.common import isnull, _values_from_object
4+
from pandas.core.common import isnull, _values_from_object, is_bool_dtype
55
import pandas.compat as compat
66
from pandas.util.decorators import Appender
77
import re
@@ -632,9 +632,10 @@ def str_split(arr, pat=None, n=None, return_type='series'):
632632
pat : string, default None
633633
String or regular expression to split on. If None, splits on whitespace
634634
n : int, default None (all)
635-
return_type : {'series', 'frame'}, default 'series
635+
return_type : {'series', 'index', 'frame'}, default 'series'
636636
If frame, returns a DataFrame (elements are strings)
637-
If series, returns an Series (elements are lists of strings).
637+
If series or index, returns the same type as the original object
638+
(elements are lists of strings).
638639
639640
Notes
640641
-----
@@ -646,9 +647,13 @@ def str_split(arr, pat=None, n=None, return_type='series'):
646647
"""
647648
from pandas.core.series import Series
648649
from pandas.core.frame import DataFrame
650+
from pandas.core.index import Index
649651

650-
if return_type not in ('series', 'frame'):
651-
raise ValueError("return_type must be {'series', 'frame'}")
652+
if return_type not in ('series', 'index', 'frame'):
653+
raise ValueError("return_type must be {'series', 'index', 'frame'}")
654+
if return_type == 'frame' and isinstance(arr, Index):
655+
raise ValueError("return_type='frame' is not supported for string "
656+
"methods on Index")
652657
if pat is None:
653658
if n is None or n == 0:
654659
n = -1
@@ -928,9 +933,9 @@ def do_copy(target):
928933
class StringMethods(object):
929934

930935
"""
931-
Vectorized string functions for Series. NAs stay NA unless handled
932-
otherwise by a particular method. Patterned after Python's string methods,
933-
with some inspiration from R's stringr package.
936+
Vectorized string functions for Series and Index. NAs stay NA unless
937+
handled otherwise by a particular method. Patterned after Python's string
938+
methods, with some inspiration from R's stringr package.
934939
935940
Examples
936941
--------
@@ -959,11 +964,18 @@ def __iter__(self):
959964
def _wrap_result(self, result):
960965
from pandas.core.series import Series
961966
from pandas.core.frame import DataFrame
967+
from pandas.core.index import Index
962968

963969
if not hasattr(result, 'ndim'):
964970
return result
965971
elif result.ndim == 1:
966972
name = getattr(result, 'name', None)
973+
if isinstance(self.series, Index):
974+
# if result is a boolean np.array, return the np.array
975+
# instead of wrapping it into a boolean Index (GH 8875)
976+
if is_bool_dtype(result):
977+
return result
978+
return Index(result, name=name or self.series.name)
967979
return Series(result, index=self.series.index,
968980
name=name or self.series.name)
969981
else:

pandas/tests/test_index.py

+34
Original file line numberDiff line numberDiff line change
@@ -1197,6 +1197,40 @@ def test_join_self(self):
11971197
for kind in kinds:
11981198
joined = res.join(res, how=kind)
11991199
self.assertIs(res, joined)
1200+
def test_str_attribute(self):
1201+
# GH9068
1202+
methods = ['strip', 'rstrip', 'lstrip']
1203+
idx = Index([' jack', 'jill ', ' jesse ', 'frank'])
1204+
for method in methods:
1205+
expected = Index([getattr(str, method)(x) for x in idx.values])
1206+
tm.assert_index_equal(getattr(Index.str, method)(idx.str), expected)
1207+
1208+
# create a few instances that are not able to use .str accessor
1209+
indices = [Index(range(5)),
1210+
tm.makeDateIndex(10),
1211+
MultiIndex.from_tuples([('foo', '1'), ('bar', '3')]),
1212+
PeriodIndex(start='2000', end='2010', freq='A')]
1213+
for idx in indices:
1214+
with self.assertRaisesRegexp(AttributeError, 'only use .str accessor'):
1215+
idx.str.repeat(2)
1216+
1217+
idx = Index(['a b c', 'd e', 'f'])
1218+
expected = Index([['a', 'b', 'c'], ['d', 'e'], ['f']])
1219+
tm.assert_index_equal(idx.str.split(), expected)
1220+
tm.assert_index_equal(idx.str.split(return_type='series'), expected)
1221+
# return_type 'index' is an alias for 'series'
1222+
tm.assert_index_equal(idx.str.split(return_type='index'), expected)
1223+
with self.assertRaisesRegexp(ValueError, 'not supported'):
1224+
idx.str.split(return_type='frame')
1225+
1226+
# test boolean case, should return np.array instead of boolean Index
1227+
idx = Index(['a1', 'a2', 'b1', 'b2'])
1228+
expected = np.array([True, True, False, False])
1229+
self.assert_array_equal(idx.str.startswith('a'), expected)
1230+
self.assertIsInstance(idx.str.startswith('a'), np.ndarray)
1231+
s = Series(range(4), index=idx)
1232+
expected = Series(range(2), index=['a1', 'a2'])
1233+
tm.assert_series_equal(s[s.index.str.startswith('a')], expected)
12001234

12011235
def test_indexing_doesnt_change_class(self):
12021236
idx = Index([1, 2, 3, 'a', 'b', 'c'])

pandas/tests/test_series.py

+13
Original file line numberDiff line numberDiff line change
@@ -4933,6 +4933,19 @@ def test_to_csv_path_is_none(self):
49334933
csv_str = s.to_csv(path=None)
49344934
self.assertIsInstance(csv_str, str)
49354935

4936+
def test_str_attribute(self):
4937+
# GH9068
4938+
methods = ['strip', 'rstrip', 'lstrip']
4939+
s = Series([' jack', 'jill ', ' jesse ', 'frank'])
4940+
for method in methods:
4941+
expected = Series([getattr(str, method)(x) for x in s.values])
4942+
assert_series_equal(getattr(Series.str, method)(s.str), expected)
4943+
4944+
# str accessor only valid with string values
4945+
s = Series(range(5))
4946+
with self.assertRaisesRegexp(AttributeError, 'only use .str accessor'):
4947+
s.str.repeat(2)
4948+
49364949
def test_clip(self):
49374950
val = self.ts.median()
49384951

0 commit comments

Comments
 (0)