PERF: increase performance of string split when expand=True

cgevans · cgevans · commit 4a2ac33ffb77 · 2015-05-25T15:50:53.000+03:00
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
@@ -3,7 +3,7 @@
 from pandas.compat import zip
 from pandas.core.common import isnull, _values_from_object, is_bool_dtype
 import pandas.compat as compat
-from pandas.util.decorators import Appender
+from pandas.util.decorators import Appender, deprecate_kwarg
 import re
 import pandas.lib as lib
 import warnings
@@ -638,6 +638,26 @@ def str_find(arr, sub, start=0, end=None, side='left'):
     return _na_map(f, arr, dtype=int)
 
 
+def str_index(arr, sub, start=0, end=None, side='left'):
+    if not isinstance(sub, compat.string_types):  # validate once, up front
+        msg = 'expected a string object, not {0}'
+        raise TypeError(msg.format(type(sub).__name__))
+
+    if side == 'left':
+        method = 'index'  # looked up per element: x.index(...)
+    elif side == 'right':
+        method = 'rindex'  # looked up per element: x.rindex(...)
+    else:  # pragma: no cover
+        raise ValueError('Invalid side')
+
+    if end is None:
+        f = lambda x: getattr(x, method)(sub, start)  # no upper bound
+    else:
+        f = lambda x: getattr(x, method)(sub, start, end)  # bounded by end
+
+    return _na_map(f, arr, dtype=int)  # f is applied through _na_map
+
+
 def str_pad(arr, width, side='left', fillchar=' '):
     """
     Pad strings in the Series/Index with an additional character to
@@ -676,7 +696,7 @@ def str_pad(arr, width, side='left', fillchar=' '):
     return _na_map(f, arr)
 
 
-def str_split(arr, pat=None, n=None, return_type='series'):
+def str_split(arr, pat=None, n=None):
     """
     Split each string (a la re.split) in the Series/Index by given
     pattern, propagating NA values. Equivalent to :meth:`str.split`.
@@ -685,29 +705,17 @@ def str_split(arr, pat=None, n=None, return_type='series'):
     ----------
     pat : string, default None
         String or regular expression to split on. If None, splits on whitespace
-    n : int, default None (all)
-    return_type : {'series', 'index', 'frame'}, default 'series'
-        If frame, returns a DataFrame (elements are strings)
-        If series or index, returns the same type as the original object
-        (elements are lists of strings).
-
-    Notes
-    -----
-    Both 0 and -1 will be interpreted as return all splits
+    n : int, default -1 (all)
+        None, 0 and -1 will be interpreted as return all splits
+    expand : bool, default False
+        * If True, return DataFrame/MultiIndex expanding dimensionality.
+        * If False, return Series/Index.
+    return_type : deprecated, use `expand`
 
     Returns
     -------
-    split : Series/Index of objects or DataFrame
+    split : Series/Index or DataFrame/MultiIndex of objects
     """
-    from pandas.core.series import Series
-    from pandas.core.frame import DataFrame
-    from pandas.core.index import Index
-
-    if return_type not in ('series', 'index', 'frame'):
-        raise ValueError("return_type must be {'series', 'index', 'frame'}")
-    if return_type == 'frame' and isinstance(arr, Index):
-        raise ValueError("return_type='frame' is not supported for string "
-                         "methods on Index")
     if pat is None:
         if n is None or n == 0:
             n = -1
@@ -722,10 +730,7 @@ def str_split(arr, pat=None, n=None, return_type='series'):
                 n = 0
             regex = re.compile(pat)
             f = lambda x: regex.split(x, maxsplit=n)
-    if return_type == 'frame':
-        res = DataFrame((x for x in _na_map(f, arr)), index=arr.index)
-    else:
-        res = _na_map(f, arr)
+    res = _na_map(f, arr)
     return res
 
 
@@ -808,7 +813,7 @@ def str_strip(arr, to_strip=None, side='both'):
 
 
 def str_wrap(arr, width, **kwargs):
-    """
+    r"""
     Wrap long strings in the Series/Index to be formatted in
     paragraphs with length less than a given width.
 
@@ -870,6 +875,44 @@ def str_wrap(arr, width, **kwargs):
     return _na_map(lambda s: '\n'.join(tw.wrap(s)), arr)
 
 
+def str_translate(arr, table, deletechars=None):
+    """
+    Map all characters in the string through the given mapping table.
+    Equivalent to standard :meth:`str.translate`. Note that the optional
+    argument deletechars is only valid if you are using python 2. For python 3,
+    character deletion should be specified via the table argument.
+
+    Parameters
+    ----------
+    table : dict (python 3), str or None (python 2)
+        In python 3, table is a mapping of Unicode ordinals to Unicode ordinals,
+        strings, or None. Unmapped characters are left untouched. Characters
+        mapped to None are deleted. :meth:`str.maketrans` is a helper function
+        for making translation tables.
+        In python 2, table is either a string of length 256 or None. If the
+        table argument is None, no translation is applied and the operation
+        simply removes the characters in deletechars. :func:`string.maketrans`
+        is a helper function for making translation tables.
+    deletechars : str, optional (python 2)
+        A string of characters to delete. This argument is only valid
+        in python 2.
+
+    Returns
+    -------
+    translated : Series/Index of objects
+    """
+    if deletechars is None:
+        f = lambda x: x.translate(table)  # single-argument form
+    else:
+        from pandas import compat  # same module the file imports as compat
+        if compat.PY3:  # reject deletechars before any element is touched
+            raise ValueError("deletechars is not a valid argument for "
+                             "str.translate in python 3. You should simply "
+                             "specify character deletions in the table argument")
+        f = lambda x: x.translate(table, deletechars)  # python 2 only
+    return _na_map(f, arr)  # f is applied through _na_map
+
+
 def str_get(arr, i):
     """
     Extract element from lists, tuples, or strings in each element in the
@@ -1001,6 +1044,7 @@ def __iter__(self):
             g = self.get(i)
 
     def _wrap_result(self, result, **kwargs):
+
         # leave as it is to keep extract and get_dummies results
         # can be merged to _wrap_result_expand in v0.17
         from pandas.core.series import Series
@@ -1024,7 +1068,10 @@ def _wrap_result(self, result, **kwargs):
             return DataFrame(result, index=self.series.index)
 
     def _wrap_result_expand(self, result, expand=False):
-        from pandas.core.index import Index
+        if not isinstance(expand, bool):
+            raise ValueError("expand must be True or False")
+
+        from pandas.core.index import Index, MultiIndex
         if not hasattr(result, 'ndim'):
             return result
 
@@ -1037,7 +1084,9 @@ def _wrap_result_expand(self, result, expand=False):
 
             if expand:
                 result = list(result)
-            return Index(result, name=name)
+                return MultiIndex.from_tuples(result, names=name)
+            else:
+                return Index(result, name=name)
         else:
             index = self.series.index
             if expand:
@@ -1054,10 +1103,12 @@ def cat(self, others=None, sep=None, na_rep=None):
         result = str_cat(self.series, others=others, sep=sep, na_rep=na_rep)
         return self._wrap_result(result)
 
+    @deprecate_kwarg('return_type', 'expand',  # old values -> expand flag
+                     mapping={'series': False, 'index': False, 'frame': True})
     @copy(str_split)
-    def split(self, pat=None, n=-1, return_type='series'):
-        result = str_split(self.series, pat, n=n, return_type=return_type)
-        return self._wrap_result(result)
+    def split(self, pat=None, n=-1, expand=False):
+        result = str_split(self.series, pat, n=n)  # expansion happens below
+        return self._wrap_result_expand(result, expand=expand)
 
     _shared_docs['str_partition'] = ("""
     Split the string at the %(side)s occurrence of `sep`, and return 3 elements
@@ -1071,7 +1122,7 @@ def split(self, pat=None, n=-1, return_type='series'):
         String to split on.
     expand : bool, default True
         * If True, return DataFrame/MultiIndex expanding dimensionality.
-        * If False, return Series/Index
+        * If False, return Series/Index.
 
     Returns
     -------
@@ -1261,6 +1312,11 @@ def get_dummies(self, sep='|'):
         result = str_get_dummies(self.series, sep)
         return self._wrap_result(result)
 
+    @copy(str_translate)  # presumably reuses str_translate's docstring; confirm
+    def translate(self, table, deletechars=None):
+        result = str_translate(self.series, table, deletechars)  # delegate
+        return self._wrap_result(result)
+
     count = _pat_wrapper(str_count, flags=True)
     startswith = _pat_wrapper(str_startswith, na=True)
     endswith = _pat_wrapper(str_endswith, na=True)
@@ -1325,6 +1381,42 @@ def normalize(self, form):
         result = _na_map(f, self.series)
         return self._wrap_result(result)
 
+    _shared_docs['index'] = ("""
+    Return %(side)s indexes in each string where the substring is
+    fully contained between [start:end]. This is the same as ``str.%(similar)s``
+    except instead of returning -1, it raises a ValueError when the substring
+    is not found. Equivalent to standard ``str.%(method)s``.
+
+    Parameters
+    ----------
+    sub : str
+        Substring being searched
+    start : int
+        Left edge index
+    end : int
+        Right edge index
+
+    Returns
+    -------
+    found : Series/Index of objects
+
+    See Also
+    --------
+    %(also)s
+    """)  # template; the %(...)s fields are filled by the Appender calls below
+
+    @Appender(_shared_docs['index'] % dict(side='lowest', similar='find', method='index',
+              also='rindex : Return highest indexes in each string'))
+    def index(self, sub, start=0, end=None):  # ValueError if sub is absent
+        result = str_index(self.series, sub, start=start, end=end, side='left')
+        return self._wrap_result(result)
+
+    @Appender(_shared_docs['index'] % dict(side='highest', similar='rfind', method='rindex',
+              also='index : Return lowest indexes in each string'))
+    def rindex(self, sub, start=0, end=None):  # ValueError if sub is absent
+        result = str_index(self.series, sub, start=start, end=end, side='right')
+        return self._wrap_result(result)
+
     _shared_docs['len'] = ("""
     Compute length of each string in the Series/Index.