3
3
from pandas .compat import zip
4
4
from pandas .core .common import isnull , _values_from_object , is_bool_dtype
5
5
import pandas .compat as compat
6
- from pandas .util .decorators import Appender
6
+ from pandas .util .decorators import Appender , deprecate_kwarg
7
7
import re
8
8
import pandas .lib as lib
9
9
import warnings
@@ -638,6 +638,26 @@ def str_find(arr, sub, start=0, end=None, side='left'):
638
638
return _na_map (f , arr , dtype = int )
639
639
640
640
641
def str_index(arr, sub, start=0, end=None, side='left'):
    """
    Locate `sub` inside each element of `arr` by calling the element's
    ``index`` (side='left') or ``rindex`` (side='right') method.

    Parameters
    ----------
    arr : values handed to ``_na_map``
    sub : str
        Substring being searched
    start : int, default 0
        Left edge index
    end : int, default None
        Right edge index. When None, only `start` is forwarded.
    side : {'left', 'right'}, default 'left'
        Selects ``index`` or ``rindex``

    Returns
    -------
    found : result of ``_na_map`` called with ``dtype=int``

    Raises
    ------
    TypeError
        If `sub` is not an instance of ``compat.string_types``
    ValueError
        If `side` is neither 'left' nor 'right'
    """
    if not isinstance(sub, compat.string_types):
        raise TypeError('expected a string object, not {0}'
                        .format(type(sub).__name__))

    if side not in ('left', 'right'):  # pragma: no cover
        raise ValueError('Invalid side')
    method = 'index' if side == 'left' else 'rindex'

    # Leave `end` out of the call entirely when the caller did not give one.
    bounds = (start,) if end is None else (start, end)

    def f(x):
        return getattr(x, method)(sub, *bounds)

    return _na_map(f, arr, dtype=int)
659
+
660
+
641
661
def str_pad (arr , width , side = 'left' , fillchar = ' ' ):
642
662
"""
643
663
Pad strings in the Series/Index with an additional character to
@@ -676,7 +696,7 @@ def str_pad(arr, width, side='left', fillchar=' '):
676
696
return _na_map (f , arr )
677
697
678
698
679
- def str_split (arr , pat = None , n = None , return_type = 'series' ):
699
+ def str_split (arr , pat = None , n = None ):
680
700
"""
681
701
Split each string (a la re.split) in the Series/Index by given
682
702
pattern, propagating NA values. Equivalent to :meth:`str.split`.
@@ -685,29 +705,17 @@ def str_split(arr, pat=None, n=None, return_type='series'):
685
705
----------
686
706
pat : string, default None
687
707
String or regular expression to split on. If None, splits on whitespace
688
- n : int, default None (all)
689
- return_type : {'series', 'index', 'frame'}, default 'series'
690
- If frame, returns a DataFrame (elements are strings)
691
- If series or index, returns the same type as the original object
692
- (elements are lists of strings).
693
-
694
- Notes
695
- -----
696
- Both 0 and -1 will be interpreted as return all splits
708
+ n : int, default -1 (all)
709
+ None, 0 and -1 will be interpreted as return all splits
710
+ expand : bool, default False
711
+ * If True, return DataFrame/MultiIndex expanding dimensionality.
712
+ * If False, return Series/Index.
713
+ return_type : deprecated, use `expand`
697
714
698
715
Returns
699
716
-------
700
- split : Series/Index of objects or DataFrame
717
+ split : Series/Index or DataFrame/MultiIndex of objects
701
718
"""
702
- from pandas .core .series import Series
703
- from pandas .core .frame import DataFrame
704
- from pandas .core .index import Index
705
-
706
- if return_type not in ('series' , 'index' , 'frame' ):
707
- raise ValueError ("return_type must be {'series', 'index', 'frame'}" )
708
- if return_type == 'frame' and isinstance (arr , Index ):
709
- raise ValueError ("return_type='frame' is not supported for string "
710
- "methods on Index" )
711
719
if pat is None :
712
720
if n is None or n == 0 :
713
721
n = - 1
@@ -722,10 +730,7 @@ def str_split(arr, pat=None, n=None, return_type='series'):
722
730
n = 0
723
731
regex = re .compile (pat )
724
732
f = lambda x : regex .split (x , maxsplit = n )
725
- if return_type == 'frame' :
726
- res = DataFrame ((x for x in _na_map (f , arr )), index = arr .index )
727
- else :
728
- res = _na_map (f , arr )
733
+ res = _na_map (f , arr )
729
734
return res
730
735
731
736
@@ -808,7 +813,7 @@ def str_strip(arr, to_strip=None, side='both'):
808
813
809
814
810
815
def str_wrap (arr , width , ** kwargs ):
811
- """
816
+ r """
812
817
Wrap long strings in the Series/Index to be formatted in
813
818
paragraphs with length less than a given width.
814
819
@@ -870,6 +875,44 @@ def str_wrap(arr, width, **kwargs):
870
875
return _na_map (lambda s : '\n ' .join (tw .wrap (s )), arr )
871
876
872
877
878
def str_translate(arr, table, deletechars=None):
    """
    Map all characters in the string through the given mapping table.
    Equivalent to standard :meth:`str.translate`. Note that the optional
    argument deletechars is only valid if you are using python 2. For python 3,
    character deletion should be specified via the table argument.

    Parameters
    ----------
    table : dict (python 3), str or None (python 2)
        In python 3, table is a mapping of Unicode ordinals to Unicode ordinals,
        strings, or None. Unmapped characters are left untouched. Characters
        mapped to None are deleted. :meth:`str.maketrans` is a helper function
        for making translation tables.
        In python 2, table is either a string of length 256 or None. If the
        table argument is None, no translation is applied and the operation
        simply removes the characters in deletechars. :func:`string.maketrans`
        is a helper function for making translation tables.
    deletechars : str, optional (python 2)
        A string of characters to delete. This argument is only valid
        in python 2.

    Returns
    -------
    translated : Series/Index of objects
    """
    # Common case first: no deletechars, so a single-argument translate.
    if deletechars is None:
        return _na_map(lambda x: x.translate(table), arr)

    # deletechars was supplied: only the two-argument (python 2) form of
    # translate accepts it, so refuse up front on python 3.
    from pandas import compat
    if compat.PY3:
        raise ValueError("deletechars is not a valid argument for "
                         "str.translate in python 3. You should simply "
                         "specify character deletions in the table argument")

    def _translate_and_delete(x):
        return x.translate(table, deletechars)

    return _na_map(_translate_and_delete, arr)
914
+
915
+
873
916
def str_get (arr , i ):
874
917
"""
875
918
Extract element from lists, tuples, or strings in each element in the
@@ -1001,6 +1044,7 @@ def __iter__(self):
1001
1044
g = self .get (i )
1002
1045
1003
1046
def _wrap_result (self , result , ** kwargs ):
1047
+
1004
1048
# leave as it is to keep extract and get_dummies results
1005
1049
# can be merged to _wrap_result_expand in v0.17
1006
1050
from pandas .core .series import Series
@@ -1024,7 +1068,10 @@ def _wrap_result(self, result, **kwargs):
1024
1068
return DataFrame (result , index = self .series .index )
1025
1069
1026
1070
def _wrap_result_expand (self , result , expand = False ):
1027
- from pandas .core .index import Index
1071
+ if not isinstance (expand , bool ):
1072
+ raise ValueError ("expand must be True or False" )
1073
+
1074
+ from pandas .core .index import Index , MultiIndex
1028
1075
if not hasattr (result , 'ndim' ):
1029
1076
return result
1030
1077
@@ -1037,7 +1084,9 @@ def _wrap_result_expand(self, result, expand=False):
1037
1084
1038
1085
if expand :
1039
1086
result = list (result )
1040
- return Index (result , name = name )
1087
+ return MultiIndex .from_tuples (result , names = name )
1088
+ else :
1089
+ return Index (result , name = name )
1041
1090
else :
1042
1091
index = self .series .index
1043
1092
if expand :
@@ -1054,10 +1103,12 @@ def cat(self, others=None, sep=None, na_rep=None):
1054
1103
result = str_cat (self .series , others = others , sep = sep , na_rep = na_rep )
1055
1104
return self ._wrap_result (result )
1056
1105
1106
# Translate the deprecated ``return_type`` keyword into ``expand``.
# All three values the old signature accepted are mapped: 'series' and
# 'index' both meant "same type as the original object" (no expansion),
# 'frame' meant expansion. Without the 'index' entry that legacy value
# would presumably be forwarded unchanged as ``expand`` and then be rejected
# by the bool check in ``_wrap_result_expand`` -- confirm against
# ``deprecate_kwarg``'s handling of values missing from a dict mapping.
@deprecate_kwarg('return_type', 'expand',
                 mapping={'series': False, 'index': False, 'frame': True})
@copy(str_split)
def split(self, pat=None, n=-1, expand=False):
    # Split every element first; the dimensionality of the output
    # (Series/Index vs. DataFrame/MultiIndex) is decided solely by
    # ``_wrap_result_expand`` from the ``expand`` flag.
    result = str_split(self.series, pat, n=n)
    return self._wrap_result_expand(result, expand=expand)
1061
1112
1062
1113
_shared_docs ['str_partition' ] = ("""
1063
1114
Split the string at the %(side)s occurrence of `sep`, and return 3 elements
@@ -1071,7 +1122,7 @@ def split(self, pat=None, n=-1, return_type='series'):
1071
1122
String to split on.
1072
1123
expand : bool, default True
1073
1124
* If True, return DataFrame/MultiIndex expanding dimensionality.
1074
- * If False, return Series/Index
1125
+ * If False, return Series/Index.
1075
1126
1076
1127
Returns
1077
1128
-------
@@ -1261,6 +1312,11 @@ def get_dummies(self, sep='|'):
1261
1312
result = str_get_dummies (self .series , sep )
1262
1313
return self ._wrap_result (result )
1263
1314
1315
@copy(str_translate)
def translate(self, table, deletechars=None):
    # Accessor wrapper: run str_translate over the wrapped data and hand the
    # outcome to _wrap_result.
    return self._wrap_result(
        str_translate(self.series, table, deletechars))
1319
+
1264
1320
count = _pat_wrapper (str_count , flags = True )
1265
1321
startswith = _pat_wrapper (str_startswith , na = True )
1266
1322
endswith = _pat_wrapper (str_endswith , na = True )
@@ -1325,6 +1381,42 @@ def normalize(self, form):
1325
1381
result = _na_map (f , self .series )
1326
1382
return self ._wrap_result (result )
1327
1383
1384
# Docstring template for the ``index``/``rindex`` accessor methods. The
# %(side)s, %(similar)s, %(method)s and %(also)s placeholders are filled in
# by each method's ``Appender`` decorator.
_shared_docs['index'] = ("""
Return %(side)s indexes in each string where the substring is
fully contained between [start:end]. This is the same as ``str.%(similar)s``
except instead of returning -1, it raises a ValueError when the substring
is not found. Equivalent to standard ``str.%(method)s``.

Parameters
----------
sub : str
    Substring being searched
start : int
    Left edge index
end : int
    Right edge index

Returns
-------
found : Series/Index of objects

See Also
--------
%(also)s
""")
1407
+
1408
@Appender(_shared_docs['index'] %
          dict(side='lowest', similar='find', method='index',
               also='rindex : Return highest indexes in each string'))
def index(self, sub, start=0, end=None):
    # side='left' selects ``str.index`` inside str_index; the documentation
    # comes from the shared template attached by the Appender decorator.
    result = str_index(self.series, sub, start=start, end=end, side='left')
    return self._wrap_result(result)
1413
+
1414
@Appender(_shared_docs['index'] %
          dict(side='highest', similar='rfind', method='rindex',
               also='index : Return lowest indexes in each string'))
def rindex(self, sub, start=0, end=None):
    # side='right' selects ``str.rindex`` inside str_index; the documentation
    # comes from the shared template attached by the Appender decorator.
    result = str_index(self.series, sub, start=start, end=end, side='right')
    return self._wrap_result(result)
1419
+
1328
1420
_shared_docs ['len' ] = ("""
1329
1421
Compute length of each string in the Series/Index.
1330
1422
0 commit comments