Merge pull request #9773 from sinhrks/partition

jorisvandenbossche · jorisvandenbossche · commit 45f69cd37cf3 · 2015-05-07T13:08:58.000+02:00
ENH: Add StringMethods.partition and rpartition
diff --git a/doc/source/api.rst b/doc/source/api.rst
@@ -544,10 +544,12 @@ strings and apply several methods to it. These can be acccessed like
    Series.str.match
    Series.str.normalize
    Series.str.pad
+   Series.str.partition
    Series.str.repeat
    Series.str.replace
    Series.str.rfind
    Series.str.rjust
+   Series.str.rpartition
    Series.str.rstrip
    Series.str.slice
    Series.str.slice_replace
diff --git a/doc/source/text.rst b/doc/source/text.rst
@@ -262,6 +262,8 @@ Method Summary
     :meth:`~Series.str.strip`,Equivalent to ``str.strip``
     :meth:`~Series.str.rstrip`,Equivalent to ``str.rstrip``
     :meth:`~Series.str.lstrip`,Equivalent to ``str.lstrip``
+    :meth:`~Series.str.partition`,Equivalent to ``str.partition``
+    :meth:`~Series.str.rpartition`,Equivalent to ``str.rpartition``
     :meth:`~Series.str.lower`,Equivalent to ``str.lower``
     :meth:`~Series.str.upper`,Equivalent to ``str.upper``
     :meth:`~Series.str.find`,Equivalent to ``str.find``
diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt
@@ -42,6 +42,7 @@ Enhancements
 - Added ``StringMethods`` (.str accessor) to ``Index`` (:issue:`9068`)
 - Added ``StringMethods.normalize()`` which behaves the same as standard :func:`unicodedata.normalizes` (:issue:`10031`)
 
+- Added ``StringMethods.partition()`` and ``rpartition()`` which behave the same as the standard ``str`` methods (:issue:`9773`)
 - Allow clip, clip_lower, and clip_upper to accept array-like arguments as thresholds (:issue:`6966`). These methods now have an ``axis`` parameter which determines how the Series or DataFrame will be aligned with the threshold(s).
 
   The ``.str`` accessor is now available for both ``Series`` and ``Index``.
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
@@ -992,6 +992,8 @@ def __iter__(self):
             g = self.get(i)
 
     def _wrap_result(self, result):
+        # left unchanged so that extract and get_dummies keep their current
+        # results; can be merged into _wrap_result_expand in v0.17
         from pandas.core.series import Series
         from pandas.core.frame import DataFrame
         from pandas.core.index import Index
@@ -1012,6 +1014,33 @@ def _wrap_result(self, result):
             assert result.ndim < 3
             return DataFrame(result, index=self.series.index)
 
+    def _wrap_result_expand(self, result, expand=False):
+        from pandas.core.index import Index
+        if not hasattr(result, 'ndim'):
+            return result
+
+        if isinstance(self.series, Index):
+            name = getattr(result, 'name', None)
+            # if result is a boolean np.array, return the np.array
+            # instead of wrapping it into a boolean Index (GH 8875)
+            if hasattr(result, 'dtype') and is_bool_dtype(result):
+                return result
+
+            if expand:
+                result = list(result)
+            return Index(result, name=name)
+        else:
+            index = self.series.index
+            if expand:
+                cons_row = self.series._constructor
+                cons = self.series._constructor_expanddim
+                data = [cons_row(x) for x in result]
+                return cons(data, index=index)
+            else:
+                name = getattr(result, 'name', None)
+                cons = self.series._constructor
+                return cons(result, name=name, index=index)
+
     @copy(str_cat)
     def cat(self, others=None, sep=None, na_rep=None):
         result = str_cat(self.series, others=others, sep=sep, na_rep=na_rep)
@@ -1022,6 +1051,65 @@ def split(self, pat=None, n=-1, return_type='series'):
         result = str_split(self.series, pat, n=n, return_type=return_type)
         return self._wrap_result(result)
 
+    _shared_docs['str_partition'] = ("""
+    Split the string at the %(side)s occurrence of `pat`, and return 3 elements
+    containing the part before the separator, the separator itself,
+    and the part after the separator.
+    If the separator is not found, return %(return)s.
+
+    Parameters
+    ----------
+    pat : string, default whitespace
+        String to split on.
+    expand : bool, default True
+        * If True, return DataFrame/MultiIndex expanding dimensionality.
+        * If False, return Series/Index
+
+    Returns
+    -------
+    split : DataFrame/MultiIndex or Series/Index of objects
+
+    See Also
+    --------
+    %(also)s
+
+    Examples
+    --------
+
+    >>> s = Series(['A_B_C', 'D_E_F', 'X'])
+    0    A_B_C
+    1    D_E_F
+    2        X
+    dtype: object
+
+    >>> s.str.partition('_')
+       0  1    2
+    0  A  _  B_C
+    1  D  _  E_F
+    2  X
+
+    >>> s.str.rpartition('_')
+         0  1  2
+    0  A_B  _  C
+    1  D_E  _  F
+    2          X
+    """)
+    @Appender(_shared_docs['str_partition'] % {'side': 'first',
+        'return': '3 elements containing the string itself, followed by two empty strings',
+        'also': 'rpartition : Split the string at the last occurrence of `pat`'})
+    def partition(self, pat=' ', expand=True):
+        f = lambda x: x.partition(pat)
+        result = _na_map(f, self.series)
+        return self._wrap_result_expand(result, expand=expand)
+
+    @Appender(_shared_docs['str_partition'] % {'side': 'last',
+        'return': '3 elements containing two empty strings, followed by the string itself',
+        'also': 'partition : Split the string at the first occurrence of `pat`'})
+    def rpartition(self, pat=' ', expand=True):
+        f = lambda x: x.rpartition(pat)
+        result = _na_map(f, self.series)
+        return self._wrap_result_expand(result, expand=expand)
+
     @copy(str_get)
     def get(self, i):
         result = str_get(self.series, i)
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
@@ -664,6 +664,8 @@ def test_empty_str_methods(self):
         tm.assert_series_equal(empty_str, empty.str.pad(42))
         tm.assert_series_equal(empty_str, empty.str.center(42))
         tm.assert_series_equal(empty_list, empty.str.split('a'))
+        tm.assert_series_equal(empty_list, empty.str.partition('a', expand=False))
+        tm.assert_series_equal(empty_list, empty.str.rpartition('a', expand=False))
         tm.assert_series_equal(empty_str, empty.str.slice(stop=1))
         tm.assert_series_equal(empty_str, empty.str.slice(step=1))
         tm.assert_series_equal(empty_str, empty.str.strip())
@@ -687,6 +689,12 @@ def test_empty_str_methods(self):
         tm.assert_series_equal(empty_str, empty.str.swapcase())
         tm.assert_series_equal(empty_str, empty.str.normalize('NFC'))
 
+    def test_empty_str_methods_to_frame(self):
+        empty_str = empty = Series(dtype=str)
+        empty_df = DataFrame([])
+        tm.assert_frame_equal(empty_df, empty.str.partition('a'))
+        tm.assert_frame_equal(empty_df, empty.str.rpartition('a'))
+
     def test_ismethods(self):
         values = ['A', 'b', 'Xy', '4', '3A', '', 'TT', '55', '-', '  ']
         str_s = Series(values)
@@ -1175,6 +1183,119 @@ def test_split_to_dataframe(self):
         with tm.assertRaisesRegexp(ValueError, "return_type must be"):
             s.str.split('_', return_type="some_invalid_type")
 
+    def test_partition_series(self):
+        values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])
+
+        result = values.str.partition('_', expand=False)
+        exp = Series([['a', '_', 'b_c'], ['c', '_', 'd_e'], NA, ['f', '_', 'g_h']])
+        tm.assert_series_equal(result, exp)
+
+        result = values.str.rpartition('_', expand=False)
+        exp = Series([['a_b', '_', 'c'], ['c_d', '_', 'e'], NA, ['f_g', '_', 'h']])
+        tm.assert_series_equal(result, exp)
+
+        # more than one char
+        values = Series(['a__b__c', 'c__d__e', NA, 'f__g__h'])
+        result = values.str.partition('__', expand=False)
+        exp = Series([['a', '__', 'b__c'], ['c', '__', 'd__e'], NA, ['f', '__', 'g__h']])
+        tm.assert_series_equal(result, exp)
+
+        result = values.str.rpartition('__', expand=False)
+        exp = Series([['a__b', '__', 'c'], ['c__d', '__', 'e'], NA, ['f__g', '__', 'h']])
+        tm.assert_series_equal(result, exp)
+
+        # default separator (single space)
+        values = Series(['a b c', 'c d e', NA, 'f g h'])
+        result = values.str.partition(expand=False)
+        exp = Series([['a', ' ', 'b c'], ['c', ' ', 'd e'], NA, ['f', ' ', 'g h']])
+        tm.assert_series_equal(result, exp)
+
+        result = values.str.rpartition(expand=False)
+        exp = Series([['a b', ' ', 'c'], ['c d', ' ', 'e'], NA, ['f g', ' ', 'h']])
+        tm.assert_series_equal(result, exp)
+
+        # Not split
+        values = Series(['abc', 'cde', NA, 'fgh'])
+        result = values.str.partition('_', expand=False)
+        exp = Series([['abc', '', ''], ['cde', '', ''], NA, ['fgh', '', '']])
+        tm.assert_series_equal(result, exp)
+
+        result = values.str.rpartition('_', expand=False)
+        exp = Series([['', '', 'abc'], ['', '', 'cde'], NA, ['', '', 'fgh']])
+        tm.assert_series_equal(result, exp)
+
+        # unicode
+        values = Series([u('a_b_c'), u('c_d_e'), NA, u('f_g_h')])
+
+        result = values.str.partition('_', expand=False)
+        exp = Series([[u('a'), u('_'), u('b_c')], [u('c'), u('_'), u('d_e')],
+                      NA, [u('f'), u('_'), u('g_h')]])
+        tm.assert_series_equal(result, exp)
+
+        result = values.str.rpartition('_', expand=False)
+        exp = Series([[u('a_b'), u('_'), u('c')], [u('c_d'), u('_'), u('e')],
+                      NA, [u('f_g'), u('_'), u('h')]])
+        tm.assert_series_equal(result, exp)
+
+        # compare to standard lib
+        values = Series(['A_B_C', 'B_C_D', 'E_F_G', 'EFGHEF'])
+        result = values.str.partition('_', expand=False).tolist()
+        self.assertEqual(result, [v.partition('_') for v in values])
+        result = values.str.rpartition('_', expand=False).tolist()
+        self.assertEqual(result, [v.rpartition('_') for v in values])
+
+    def test_partition_index(self):
+        values = Index(['a_b_c', 'c_d_e', 'f_g_h'])
+
+        result = values.str.partition('_', expand=False)
+        exp = Index(np.array([('a', '_', 'b_c'), ('c', '_', 'd_e'), ('f', '_', 'g_h')]))
+        tm.assert_index_equal(result, exp)
+        self.assertEqual(result.nlevels, 1)
+
+        result = values.str.rpartition('_', expand=False)
+        exp = Index(np.array([('a_b', '_', 'c'), ('c_d', '_', 'e'), ('f_g', '_', 'h')]))
+        tm.assert_index_equal(result, exp)
+        self.assertEqual(result.nlevels, 1)
+
+        result = values.str.partition('_')
+        exp = Index([('a', '_', 'b_c'), ('c', '_', 'd_e'), ('f', '_', 'g_h')])
+        tm.assert_index_equal(result, exp)
+        self.assertTrue(isinstance(result, MultiIndex))
+        self.assertEqual(result.nlevels, 3)
+
+        result = values.str.rpartition('_')
+        exp = Index([('a_b', '_', 'c'), ('c_d', '_', 'e'), ('f_g', '_', 'h')])
+        tm.assert_index_equal(result, exp)
+        self.assertTrue(isinstance(result, MultiIndex))
+        self.assertEqual(result.nlevels, 3)
+
+    def test_partition_to_dataframe(self):
+        values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])
+        result = values.str.partition('_')
+        exp = DataFrame({0: ['a', 'c', np.nan, 'f'],
+                         1: ['_', '_', np.nan, '_'],
+                         2: ['b_c', 'd_e', np.nan, 'g_h']})
+        tm.assert_frame_equal(result, exp)
+
+        result = values.str.rpartition('_')
+        exp = DataFrame({0: ['a_b', 'c_d', np.nan, 'f_g'],
+                         1: ['_', '_', np.nan, '_'],
+                         2: ['c', 'e', np.nan, 'h']})
+        tm.assert_frame_equal(result, exp)
+
+        values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])
+        result = values.str.partition('_', expand=True)
+        exp = DataFrame({0: ['a', 'c', np.nan, 'f'],
+                         1: ['_', '_', np.nan, '_'],
+                         2: ['b_c', 'd_e', np.nan, 'g_h']})
+        tm.assert_frame_equal(result, exp)
+
+        result = values.str.rpartition('_', expand=True)
+        exp = DataFrame({0: ['a_b', 'c_d', np.nan, 'f_g'],
+                         1: ['_', '_', np.nan, '_'],
+                         2: ['c', 'e', np.nan, 'h']})
+        tm.assert_frame_equal(result, exp)
+
     def test_pipe_failures(self):
         # #2119
         s = Series(['A|B|C'])