Skip to content

Commit 4b68a72

Browse files
committed
ENH: add StringMethods (.str accessor) to Index, fixes pandas-dev#9068
1 parent 10c933b commit 4b68a72

File tree

7 files changed

+104
-24
lines changed

7 files changed

+104
-24
lines changed

doc/source/text.rst

+9-2
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,10 @@ Working with Text Data
1717

1818
.. _text.string_methods:
1919

20-
Series is equipped with a set of string processing methods
20+
Series and Index are equipped with a set of string processing methods
2121
that make it easy to operate on each element of the array. Perhaps most
2222
importantly, these methods exclude missing/NA values automatically. These are
23-
accessed via the Series's ``str`` attribute and generally have names matching
23+
accessed via the ``str`` attribute and generally have names matching
2424
the equivalent (scalar) built-in string methods:
2525

2626
.. ipython:: python
@@ -30,6 +30,13 @@ the equivalent (scalar) built-in string methods:
3030
s.str.upper()
3131
s.str.len()
3232
33+
.. ipython:: python
34+
35+
idx = Index([' jack', 'jill ', ' jesse ', 'frank'])
36+
idx.str.strip()
37+
idx.str.lstrip()
38+
idx.str.rstrip()
39+
3340
Splitting and Replacing Strings
3441
-------------------------------
3542

doc/source/whatsnew/v0.16.1.txt

+15
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,26 @@ Enhancements
1818
~~~~~~~~~~~~
1919

2020
- Added ``StringMethods.capitalize()`` and ``StringMethods.swapcase()``, which behave the same as the standard ``str`` methods (:issue:`9766`)
21+
- Added ``StringMethods`` (.str accessor) to ``Index`` (:issue:`9068`)
2122

23+
The ``.str`` accessor is now available for both ``Series`` and ``Index``.
2224

25+
.. ipython:: python
2326

27+
idx = Index([' jack', 'jill ', ' jesse ', 'frank'])
28+
idx.str.strip()
2429

30+
One special case for the ``.str`` accessor on ``Index`` is that if a string method returns ``bool``, the ``.str`` accessor
31+
will return a ``np.ndarray`` instead of a boolean ``Index`` (:issue:`8875`). This enables the following expression
32+
to work naturally:
2533

34+
.. ipython:: python
35+
36+
idx = Index(['a1', 'a2', 'b1', 'b2'])
37+
s = Series(range(4), index=idx)
38+
s
39+
idx.str.startswith('a')
40+
s[s.index.str.startswith('a')]
2641

2742
.. _whatsnew_0161.api:
2843

pandas/core/base.py

+19
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import pandas.tslib as tslib
1111
import pandas.lib as lib
1212
from pandas.util.decorators import Appender, cache_readonly
13+
from pandas.core.strings import StringMethods
1314

1415

1516
_shared_docs = dict()
@@ -497,6 +498,24 @@ def searchsorted(self, key, side='left'):
497498
#### needs tests/doc-string
498499
return self.values.searchsorted(key, side=side)
499500

501+
# string methods
502+
def _make_str_accessor(self):
503+
from pandas.core.series import Series
504+
from pandas.core.index import Index
505+
if isinstance(self, Series) and not com.is_object_dtype(self.dtype):
506+
# this really should exclude all series with any non-string values,
507+
# but that isn't practical for performance reasons until we have a
508+
# str dtype (GH 9343)
509+
raise AttributeError("Can only use .str accessor with string "
510+
"values, which use np.object_ dtype in "
511+
"pandas")
512+
elif isinstance(self, Index) and self.inferred_type != 'string':
513+
raise AttributeError("Can only use .str accessor with string "
514+
"values (i.e. inferred_type is 'string')")
515+
return StringMethods(self)
516+
517+
str = AccessorProperty(StringMethods, _make_str_accessor)
518+
500519
_shared_docs['drop_duplicates'] = (
501520
"""Return %(klass)s with duplicate values removed
502521

pandas/core/series.py

-16
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@
2828
from pandas.core import generic, base
2929
from pandas.core.internals import SingleBlockManager
3030
from pandas.core.categorical import Categorical, CategoricalAccessor
31-
from pandas.core.strings import StringMethods
3231
from pandas.tseries.common import (maybe_to_datetimelike,
3332
CombinedDatetimelikeProperties)
3433
from pandas.tseries.index import DatetimeIndex
@@ -2494,21 +2493,6 @@ def to_period(self, freq=None, copy=True):
24942493
return self._constructor(new_values,
24952494
index=new_index).__finalize__(self)
24962495

2497-
#------------------------------------------------------------------------------
2498-
# string methods
2499-
2500-
def _make_str_accessor(self):
2501-
if not com.is_object_dtype(self.dtype):
2502-
# this really should exclude all series with any non-string values,
2503-
# but that isn't practical for performance reasons until we have a
2504-
# str dtype (GH 9343)
2505-
raise AttributeError("Can only use .str accessor with string "
2506-
"values, which use np.object_ dtype in "
2507-
"pandas")
2508-
return StringMethods(self)
2509-
2510-
str = base.AccessorProperty(StringMethods, _make_str_accessor)
2511-
25122496
#------------------------------------------------------------------------------
25132497
# Datetimelike delegation methods
25142498

pandas/core/strings.py

+17-6
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import numpy as np
22

33
from pandas.compat import zip
4-
from pandas.core.common import isnull, _values_from_object
4+
from pandas.core.common import isnull, _values_from_object, is_bool_dtype
55
import pandas.compat as compat
66
from pandas.util.decorators import Appender
77
import re
@@ -632,9 +632,9 @@ def str_split(arr, pat=None, n=None, return_type='series'):
632632
pat : string, default None
633633
String or regular expression to split on. If None, splits on whitespace
634634
n : int, default None (all)
635-
return_type : {'series', 'frame'}, default 'series
635+
return_type : {'series', 'frame'}, default 'series'
636636
If frame, returns a DataFrame (elements are strings)
637-
If series, returns an Series (elements are lists of strings).
637+
If series, returns a Series (elements are lists of strings).
638638
639639
Notes
640640
-----
@@ -646,9 +646,13 @@ def str_split(arr, pat=None, n=None, return_type='series'):
646646
"""
647647
from pandas.core.series import Series
648648
from pandas.core.frame import DataFrame
649+
from pandas.core.index import Index
649650

650651
if return_type not in ('series', 'frame'):
651652
raise ValueError("return_type must be {'series', 'frame'}")
653+
if return_type == 'frame' and isinstance(arr, Index):
654+
raise ValueError("return_type='frame' is not supported for string "
655+
"methods on Index")
652656
if pat is None:
653657
if n is None or n == 0:
654658
n = -1
@@ -926,9 +930,9 @@ def do_copy(target):
926930
class StringMethods(object):
927931

928932
"""
929-
Vectorized string functions for Series. NAs stay NA unless handled
930-
otherwise by a particular method. Patterned after Python's string methods,
931-
with some inspiration from R's stringr package.
933+
Vectorized string functions for Series and Index. NAs stay NA unless
934+
handled otherwise by a particular method. Patterned after Python's string
935+
methods, with some inspiration from R's stringr package.
932936
933937
Examples
934938
--------
@@ -957,11 +961,18 @@ def __iter__(self):
957961
def _wrap_result(self, result):
958962
from pandas.core.series import Series
959963
from pandas.core.frame import DataFrame
964+
from pandas.core.index import Index
960965

961966
if not hasattr(result, 'ndim'):
962967
return result
963968
elif result.ndim == 1:
964969
name = getattr(result, 'name', None)
970+
if isinstance(self.series, Index):
971+
# if result is a boolean np.array, return the np.array
972+
# instead of wrapping it into a boolean Index (GH 8875)
973+
if is_bool_dtype(result):
974+
return result
975+
return Index(result, name=name or self.series.name)
965976
return Series(result, index=self.series.index,
966977
name=name or self.series.name)
967978
else:

pandas/tests/test_index.py

+31
Original file line numberDiff line numberDiff line change
@@ -1174,6 +1174,37 @@ def test_join_self(self):
11741174
for kind in kinds:
11751175
joined = res.join(res, how=kind)
11761176
self.assertIs(res, joined)
1177+
def test_str_attribute(self):
1178+
# GH9068
1179+
methods = ['strip', 'rstrip', 'lstrip']
1180+
idx = Index([' jack', 'jill ', ' jesse ', 'frank'])
1181+
for method in methods:
1182+
expected = Index([getattr(str, method)(x) for x in idx.values])
1183+
tm.assert_index_equal(getattr(Index.str, method)(idx.str), expected)
1184+
1185+
# create a few instances that are not able to use .str accessor
1186+
indices = [Index(range(5)),
1187+
tm.makeDateIndex(10),
1188+
MultiIndex.from_tuples([('foo', '1'), ('bar', '3')]),
1189+
PeriodIndex(start='2000', end='2010', freq='A')]
1190+
for idx in indices:
1191+
with self.assertRaisesRegexp(AttributeError, 'only use .str accessor'):
1192+
idx.str.repeat(2)
1193+
1194+
idx = Index(['a b c', 'd e', 'f'])
1195+
expected = Index([['a', 'b', 'c'], ['d', 'e'], ['f']])
1196+
tm.assert_index_equal(idx.str.split(), expected)
1197+
with self.assertRaisesRegexp(ValueError, 'not supported'):
1198+
idx.str.split(return_type='frame')
1199+
1200+
# test boolean case, should return np.array instead of boolean Index
1201+
idx = Index(['a1', 'a2', 'b1', 'b2'])
1202+
expected = np.array([True, True, False, False])
1203+
self.assert_array_equal(idx.str.startswith('a'), expected)
1204+
self.assertIsInstance(idx.str.startswith('a'), np.ndarray)
1205+
s = Series(range(4), index=idx)
1206+
expected = Series(range(2), index=['a1', 'a2'])
1207+
tm.assert_series_equal(s[s.index.str.startswith('a')], expected)
11771208

11781209
def test_indexing_doesnt_change_class(self):
11791210
idx = Index([1, 2, 3, 'a', 'b', 'c'])

pandas/tests/test_series.py

+13
Original file line numberDiff line numberDiff line change
@@ -4855,6 +4855,19 @@ def test_to_csv_path_is_none(self):
48554855
csv_str = s.to_csv(path=None)
48564856
self.assertIsInstance(csv_str, str)
48574857

4858+
def test_str_attribute(self):
4859+
# GH9068
4860+
methods = ['strip', 'rstrip', 'lstrip']
4861+
s = Series([' jack', 'jill ', ' jesse ', 'frank'])
4862+
for method in methods:
4863+
expected = Series([getattr(str, method)(x) for x in s.values])
4864+
assert_series_equal(getattr(Series.str, method)(s.str), expected)
4865+
4866+
# str accessor only valid with string values
4867+
s = Series(range(5))
4868+
with self.assertRaisesRegexp(AttributeError, 'only use .str accessor'):
4869+
s.str.repeat(2)
4870+
48584871
def test_clip(self):
48594872
val = self.ts.median()
48604873

0 commit comments

Comments
 (0)