# This is a combination of 2 commits.

cgevans · cgevans · commit 1e2d44e174a6 · 2015-05-25T15:50:01.000+03:00
# The first commit's message is:

PERF: increase performance of str_split when returning a frame

# The 2nd commit message will be skipped:

#	PERF: new str.split performance improvement, handles NaN
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
@@ -3,7 +3,7 @@
 from pandas.compat import zip
 from pandas.core.common import isnull, _values_from_object, is_bool_dtype
 import pandas.compat as compat
-from pandas.util.decorators import Appender, deprecate_kwarg
+from pandas.util.decorators import Appender
 import re
 import pandas.lib as lib
 import warnings
@@ -638,26 +638,6 @@ def str_find(arr, sub, start=0, end=None, side='left'):
     return _na_map(f, arr, dtype=int)
 
 
-def str_index(arr, sub, start=0, end=None, side='left'):
-    if not isinstance(sub, compat.string_types):
-        msg = 'expected a string object, not {0}'
-        raise TypeError(msg.format(type(sub).__name__))
-
-    if side == 'left':
-        method = 'index'
-    elif side == 'right':
-        method = 'rindex'
-    else:  # pragma: no cover
-        raise ValueError('Invalid side')
-
-    if end is None:
-        f = lambda x: getattr(x, method)(sub, start)
-    else:
-        f = lambda x: getattr(x, method)(sub, start, end)
-
-    return _na_map(f, arr, dtype=int)
-
-
 def str_pad(arr, width, side='left', fillchar=' '):
     """
     Pad strings in the Series/Index with an additional character to
@@ -696,7 +676,7 @@ def str_pad(arr, width, side='left', fillchar=' '):
     return _na_map(f, arr)
 
 
-def str_split(arr, pat=None, n=None):
+def str_split(arr, pat=None, n=None, return_type='series'):
     """
     Split each string (a la re.split) in the Series/Index by given
     pattern, propagating NA values. Equivalent to :meth:`str.split`.
@@ -705,17 +685,29 @@ def str_split(arr, pat=None, n=None):
     ----------
     pat : string, default None
         String or regular expression to split on. If None, splits on whitespace
-    n : int, default -1 (all)
-        None, 0 and -1 will be interpreted as return all splits
-    expand : bool, default False
-        * If True, return DataFrame/MultiIndex expanding dimensionality.
-        * If False, return Series/Index.
-    return_type : deprecated, use `expand`
+    n : int, default None (all)
+    return_type : {'series', 'index', 'frame'}, default 'series'
+        If 'frame', returns a DataFrame whose elements are strings (not
+        supported on Index: raises ValueError). If 'series' or 'index',
+        returns the original object's type (elements are lists of strings).
+
+    Notes
+    -----
+    None, 0 and -1 are all interpreted as "return all splits".
 
     Returns
     -------
-    split : Series/Index or DataFrame/MultiIndex of objects
+    split : Series/Index of objects or DataFrame
     """
+    from pandas.core.series import Series
+    from pandas.core.frame import DataFrame
+    from pandas.core.index import Index
+
+    if return_type not in ('series', 'index', 'frame'):
+        raise ValueError("return_type must be {'series', 'index', 'frame'}")
+    if return_type == 'frame' and isinstance(arr, Index):
+        raise ValueError("return_type='frame' is not supported for string "
+                         "methods on Index")
     if pat is None:
         if n is None or n == 0:
             n = -1
@@ -730,7 +722,10 @@ def str_split(arr, pat=None, n=None):
                 n = 0
             regex = re.compile(pat)
             f = lambda x: regex.split(x, maxsplit=n)
-    res = _na_map(f, arr)
+    if return_type == 'frame':
+        res = DataFrame((x for x in _na_map(f, arr)), index=arr.index)
+    else:
+        res = _na_map(f, arr)
     return res
 
 
@@ -813,7 +808,7 @@ def str_strip(arr, to_strip=None, side='both'):
 
 
 def str_wrap(arr, width, **kwargs):
-    r"""
+    """
     Wrap long strings in the Series/Index to be formatted in
     paragraphs with length less than a given width.
 
@@ -875,44 +870,6 @@ def str_wrap(arr, width, **kwargs):
     return _na_map(lambda s: '\n'.join(tw.wrap(s)), arr)
 
 
-def str_translate(arr, table, deletechars=None):
-    """
-    Map all characters in the string through the given mapping table.
-    Equivalent to standard :meth:`str.translate`. Note that the optional
-    argument deletechars is only valid if you are using python 2. For python 3,
-    character deletion should be specified via the table argument.
-
-    Parameters
-    ----------
-    table : dict (python 3), str or None (python 2)
-        In python 3, table is a mapping of Unicode ordinals to Unicode ordinals,
-        strings, or None. Unmapped characters are left untouched. Characters
-        mapped to None are deleted. :meth:`str.maketrans` is a helper function
-        for making translation tables.
-        In python 2, table is either a string of length 256 or None. If the
-        table argument is None, no translation is applied and the operation
-        simply removes the characters in deletechars. :func:`string.maketrans`
-        is a helper function for making translation tables.
-    deletechars : str, optional (python 2)
-        A string of characters to delete. This argument is only valid
-        in python 2.
-
-    Returns
-    -------
-    translated : Series/Index of objects
-    """
-    if deletechars is None:
-        f = lambda x: x.translate(table)
-    else:
-        from pandas import compat
-        if compat.PY3:
-            raise ValueError("deletechars is not a valid argument for "
-                             "str.translate in python 3. You should simply "
-                             "specify character deletions in the table argument")
-        f = lambda x: x.translate(table, deletechars)
-    return _na_map(f, arr)
-
-
 def str_get(arr, i):
     """
     Extract element from lists, tuples, or strings in each element in the
@@ -1044,7 +1001,6 @@ def __iter__(self):
             g = self.get(i)
 
     def _wrap_result(self, result, **kwargs):
-
         # leave as it is to keep extract and get_dummies results
         # can be merged to _wrap_result_expand in v0.17
         from pandas.core.series import Series
@@ -1068,10 +1024,7 @@ def _wrap_result(self, result, **kwargs):
             return DataFrame(result, index=self.series.index)
 
     def _wrap_result_expand(self, result, expand=False):
-        if not isinstance(expand, bool):
-            raise ValueError("expand must be True or False")
-
-        from pandas.core.index import Index, MultiIndex
+        from pandas.core.index import Index
         if not hasattr(result, 'ndim'):
             return result
 
@@ -1084,16 +1037,13 @@ def _wrap_result_expand(self, result, expand=False):
 
             if expand:
                 result = list(result)
-                return MultiIndex.from_tuples(result, names=name)
-            else:
-                return Index(result, name=name)
+            return Index(result, name=name)
         else:
             index = self.series.index
             if expand:
-                cons_row = self.series._constructor
                 cons = self.series._constructor_expanddim
-                data = [cons_row(x) for x in result]
-                return cons(data, index=index)
+                data = [x if (x is not np.nan) else [None] for x in result]
+                return cons(data, index=index).fillna(np.nan)
             else:
                 name = getattr(result, 'name', None)
                 cons = self.series._constructor
@@ -1104,12 +1054,10 @@ def cat(self, others=None, sep=None, na_rep=None):
         result = str_cat(self.series, others=others, sep=sep, na_rep=na_rep)
         return self._wrap_result(result)
 
-    @deprecate_kwarg('return_type', 'expand',
-                     mapping={'series': False, 'frame': True})
     @copy(str_split)
-    def split(self, pat=None, n=-1, expand=False):
-        result = str_split(self.series, pat, n=n)
-        return self._wrap_result_expand(result, expand=expand)
+    def split(self, pat=None, n=-1, return_type='series'):
+        result = str_split(self.series, pat, n=n, return_type=return_type)
+        return self._wrap_result(result)
 
     _shared_docs['str_partition'] = ("""
     Split the string at the %(side)s occurrence of `sep`, and return 3 elements
@@ -1123,7 +1071,7 @@ def split(self, pat=None, n=-1, expand=False):
         String to split on.
     expand : bool, default True
         * If True, return DataFrame/MultiIndex expanding dimensionality.
-        * If False, return Series/Index.
+        * If False, return Series/Index
 
     Returns
     -------
@@ -1313,11 +1261,6 @@ def get_dummies(self, sep='|'):
         result = str_get_dummies(self.series, sep)
         return self._wrap_result(result)
 
-    @copy(str_translate)
-    def translate(self, table, deletechars=None):
-        result = str_translate(self.series, table, deletechars)
-        return self._wrap_result(result)
-
     count = _pat_wrapper(str_count, flags=True)
     startswith = _pat_wrapper(str_startswith, na=True)
     endswith = _pat_wrapper(str_endswith, na=True)
@@ -1382,42 +1325,6 @@ def normalize(self, form):
         result = _na_map(f, self.series)
         return self._wrap_result(result)
 
-    _shared_docs['index'] = ("""
-    Return %(side)s indexes in each strings where the substring is
-    fully contained between [start:end]. This is the same as ``str.%(similar)s``
-    except instead of returning -1, it raises a ValueError when the substring
-    is not found. Equivalent to standard ``str.%(method)s``.
-
-    Parameters
-    ----------
-    sub : str
-        Substring being searched
-    start : int
-        Left edge index
-    end : int
-        Right edge index
-
-    Returns
-    -------
-    found : Series/Index of objects
-
-    See Also
-    --------
-    %(also)s
-    """)
-
-    @Appender(_shared_docs['index'] % dict(side='lowest', similar='find', method='index',
-              also='rindex : Return highest indexes in each strings'))
-    def index(self, sub, start=0, end=None):
-        result = str_index(self.series, sub, start=start, end=end, side='left')
-        return self._wrap_result(result)
-
-    @Appender(_shared_docs['index'] % dict(side='highest', similar='rfind', method='rindex',
-              also='index : Return lowest indexes in each strings'))
-    def rindex(self, sub, start=0, end=None):
-        result = str_index(self.series, sub, start=start, end=end, side='right')
-        return self._wrap_result(result)
-
     _shared_docs['len'] = ("""
     Compute length of each string in the Series/Index.