ENH: Add StringMethods.partition and rpartition

sinhrks · sinhrks · commit ea0757f49e05 · 2015-04-01T22:32:56.000+09:00
diff --git a/doc/source/api.rst b/doc/source/api.rst
@@ -539,10 +539,12 @@ strings and apply several methods to it. These can be acccessed like
    Series.str.lstrip
    Series.str.match
    Series.str.pad
+   Series.str.partition
    Series.str.repeat
    Series.str.replace
    Series.str.rfind
    Series.str.rjust
+   Series.str.rpartition
    Series.str.rstrip
    Series.str.slice
    Series.str.slice_replace
diff --git a/doc/source/text.rst b/doc/source/text.rst
@@ -229,6 +229,8 @@ Method Summary
     :meth:`~Series.str.strip`,Equivalent to ``str.strip``
     :meth:`~Series.str.rstrip`,Equivalent to ``str.rstrip``
     :meth:`~Series.str.lstrip`,Equivalent to ``str.lstrip``
+    :meth:`~Series.str.partition`,Equivalent to ``str.partition``
+    :meth:`~Series.str.rpartition`,Equivalent to ``str.rpartition``
     :meth:`~Series.str.lower`,Equivalent to ``str.lower``
     :meth:`~Series.str.upper`,Equivalent to ``str.upper``
     :meth:`~Series.str.find`,Equivalent to ``str.find``
diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt
@@ -19,7 +19,7 @@ Enhancements
 
 
 
-
+- Added ``StringMethods.partition()`` and ``rpartition()`` which behave the same as the standard ``str`` methods (:issue:`xxxx`)
 
 
 .. _whatsnew_0161.api:
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
@@ -622,6 +622,19 @@ def str_pad(arr, width, side='left', fillchar=' '):
     return _na_map(f, arr)
 
 
+def _return_type_wrapper(f, arr, return_type):
+    if return_type not in ('series', 'frame'):
+        raise ValueError("return_type must be {'series', 'frame'}")
+
+    if return_type == 'frame':
+       from pandas.core.frame import DataFrame
+       from pandas.core.series import Series
+       return DataFrame((Series(x) for x in _na_map(f, arr)), index=arr.index)
+
+    else:
+        return _na_map(f, arr)
+
+
 def str_split(arr, pat=None, n=None, return_type='series'):
     """
     Split each string (a la re.split) in array by given pattern, propagating NA
@@ -644,11 +657,6 @@ def str_split(arr, pat=None, n=None, return_type='series'):
     -------
     split : array
     """
-    from pandas.core.series import Series
-    from pandas.core.frame import DataFrame
-
-    if return_type not in ('series', 'frame'):
-        raise ValueError("return_type must be {'series', 'frame'}")
     if pat is None:
         if n is None or n == 0:
             n = -1
@@ -663,11 +671,7 @@ def str_split(arr, pat=None, n=None, return_type='series'):
                 n = 0
             regex = re.compile(pat)
             f = lambda x: regex.split(x, maxsplit=n)
-    if return_type == 'frame':
-        res = DataFrame((Series(x) for x in _na_map(f, arr)), index=arr.index)
-    else:
-        res = _na_map(f, arr)
-    return res
+    return _return_type_wrapper(f, arr, return_type)
 
 
 def str_slice(arr, start=None, stop=None, step=None):
@@ -978,6 +982,37 @@ def split(self, pat=None, n=-1, return_type='series'):
         result = str_split(self.series, pat, n=n, return_type=return_type)
         return self._wrap_result(result)
 
+    _shared_docs['str_partition'] = ("""
+    Split the string at the %(side)s occurrence of sep, and return a 3-tuple containing the part
+    before the separator, the separator itself, and the part after the separator.
+    If the separator is not found, return %(return)s.
+
+    Parameters
+    ----------
+    pat : string, default ' '
+        String to split on.
+    return_type : {'series', 'frame'}, default 'series'
+        If frame, returns a DataFrame (elements are strings)
+        If series, returns a Series (elements are lists of strings).
+
+    Returns
+    -------
+    split : array
+    """)
+    @Appender(_shared_docs['str_partition'] % {'side': 'first',
+        'return': 'a 3-tuple containing the string itself, followed by two empty strings'})
+    def partition(self, pat=' ', return_type='series'):
+        f = lambda x: x.partition(pat)
+        result = _return_type_wrapper(f, self.series, return_type)
+        return self._wrap_result(result)
+
+    @Appender(_shared_docs['str_partition'] % {'side': 'last',
+        'return': 'a 3-tuple containing two empty strings, followed by the string itself'})
+    def rpartition(self, pat=' ', return_type='series'):
+        f = lambda x: x.rpartition(pat)
+        result = _return_type_wrapper(f, self.series, return_type)
+        return self._wrap_result(result)
+
     @copy(str_get)
     def get(self, i):
         result = str_get(self.series, i)
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
@@ -617,6 +617,8 @@ def test_empty_str_methods(self):
         tm.assert_series_equal(empty_str, empty.str.pad(42))
         tm.assert_series_equal(empty_str, empty.str.center(42))
         tm.assert_series_equal(empty_list, empty.str.split('a'))
+        tm.assert_series_equal(empty_list, empty.str.partition('a'))
+        tm.assert_series_equal(empty_list, empty.str.rpartition('a'))
         tm.assert_series_equal(empty_str, empty.str.slice(stop=1))
         tm.assert_series_equal(empty_str, empty.str.slice(step=1))
         tm.assert_series_equal(empty_str, empty.str.strip())
@@ -1125,6 +1127,82 @@ def test_split_to_dataframe(self):
         with tm.assertRaisesRegexp(ValueError, "return_type must be"):
             s.str.split('_', return_type="some_invalid_type")
 
+    def test_partition(self):
+        values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])
+
+        result = values.str.partition('_')
+        exp = Series([['a', '_', 'b_c'], ['c', '_', 'd_e'], NA, ['f', '_', 'g_h']])
+        tm.assert_series_equal(result, exp)
+
+        result = values.str.rpartition('_')
+        exp = Series([['a_b', '_', 'c'], ['c_d', '_', 'e'], NA, ['f_g', '_', 'h']])
+        tm.assert_series_equal(result, exp)
+
+        # more than one char
+        values = Series(['a__b__c', 'c__d__e', NA, 'f__g__h'])
+        result = values.str.partition('__')
+        exp = Series([['a', '__', 'b__c'], ['c', '__', 'd__e'], NA, ['f', '__', 'g__h']])
+        tm.assert_series_equal(result, exp)
+
+        result = values.str.rpartition('__')
+        exp = Series([['a__b', '__', 'c'], ['c__d', '__', 'e'], NA, ['f__g', '__', 'h']])
+        tm.assert_series_equal(result, exp)
+
+        # None
+        values = Series(['a b c', 'c d e', NA, 'f g h'])
+        result = values.str.partition()
+        exp = Series([['a', ' ', 'b c'], ['c', ' ', 'd e'], NA, ['f', ' ', 'g h']])
+        tm.assert_series_equal(result, exp)
+
+        result = values.str.rpartition()
+        exp = Series([['a b', ' ', 'c'], ['c d', ' ', 'e'], NA, ['f g', ' ', 'h']])
+        tm.assert_series_equal(result, exp)
+
+        # Not split
+        values = Series(['abc', 'cde', NA, 'fgh'])
+        result = values.str.partition('_')
+        exp = Series([['abc', '', ''], ['cde', '', ''], NA, ['fgh', '', '']])
+        tm.assert_series_equal(result, exp)
+
+        result = values.str.rpartition('_')
+        exp = Series([['', '', 'abc'], ['', '', 'cde'], NA, ['', '', 'fgh']])
+        tm.assert_series_equal(result, exp)
+
+        # unicode
+        values = Series([u('a_b_c'), u('c_d_e'), NA, u('f_g_h')])
+
+        result = values.str.partition('_')
+        exp = Series([[u('a'), u('_'), u('b_c')], [u('c'), u('_'), u('d_e')],
+                      NA, [u('f'), u('_'), u('g_h')]])
+        tm.assert_series_equal(result, exp)
+
+        result = values.str.rpartition('_')
+        exp = Series([[u('a_b'), u('_'), u('c')], [u('c_d'), u('_'), u('e')],
+                      NA, [u('f_g'), u('_'), u('h')]])
+        tm.assert_series_equal(result, exp)
+
+        # compare to standard lib
+        values = Series(['A_B_C', 'B_C_D', 'E_F_G', 'EFGHEF'])
+        self.assertEqual(values.str.partition('_').tolist(), [v.partition('_') for v in values])
+        self.assertEqual(values.str.rpartition('_').tolist(), [v.rpartition('_') for v in values])
+
+    def test_partition_to_dataframe(self):
+        values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])
+        result = values.str.partition('_', return_type='frame')
+        exp = DataFrame({0: ['a', 'c', np.nan, 'f'],
+                         1: ['_', '_', np.nan, '_'],
+                         2: ['b_c', 'd_e', np.nan, 'g_h']})
+        tm.assert_frame_equal(result, exp)
+
+        result = values.str.rpartition('_', return_type='frame')
+        exp = DataFrame({0: ['a_b', 'c_d', np.nan, 'f_g'],
+                         1: ['_', '_', np.nan, '_'],
+                         2: ['c', 'e', np.nan, 'h']})
+        tm.assert_frame_equal(result, exp)
+
+        with tm.assertRaisesRegexp(ValueError, "return_type must be"):
+            values.str.partition('_', return_type="some_invalid_type")
+
     def test_pipe_failures(self):
         # #2119
         s = Series(['A|B|C'])

Original file line number	Diff line number	Diff line change
`@@ -19,7 +19,7 @@ Enhancements`
`19`	`19`
`20`	`20`
`21`	`21`
`22`		`-`
	`22`	+- Added ``StringMethods.partition()`` and ``rpartition()`` which behave the same as the standard ``str`` methods (:issue:`xxxx`)
`23`	`23`
`24`	`24`
`25`	`25`	`.. _whatsnew_0161.api:`