ENH: Implementing str_wrap function

jeffreystarr · jreback · commit 5b838ab44384 · 2014-04-29T17:02:45.000-04:00
diff --git a/doc/source/basics.rst b/doc/source/basics.rst
@@ -1224,6 +1224,7 @@ Methods like ``match``, ``contains``, ``startswith``, and ``endswith`` take
     ``repeat``,Duplicate values (``s.str.repeat(3)`` equivalent to ``x * 3``)
     ``pad``,"Add whitespace to left, right, or both sides of strings"
     ``center``,Equivalent to ``pad(side='both')``
+    ``wrap``,Split long strings into lines with length at most a given width
     ``slice``,Slice each string in the Series
     ``slice_replace``,Replace slice in each string with passed value
     ``count``,Count occurrences of pattern
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -299,6 +299,7 @@ Improvements to existing features
 - Refactor Block classes removing `Block.items` attributes to avoid duplication
   in item handling (:issue:`6745`, :issue:`6988`).
 - Improve performance in certain reindexing operations by optimizing ``take_2d`` (:issue:`6749`)
+- Arrays of strings can be wrapped to a specified width (``str.wrap``) (:issue:`6999`)
 
 .. _release.bug_fixes-0.14.0:
 
diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt
@@ -537,6 +537,8 @@ Enhancements
                 columns=Grouper(freq='M', key='PayDay'),
                 values='Quantity', aggfunc=np.sum)
 
+- ``str.wrap`` implemented (:issue:`6999`)
+
 Performance
 ~~~~~~~~~~~
 
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
@@ -8,6 +8,7 @@
 import re
 import pandas.lib as lib
 import warnings
+import textwrap
 
 
 def _get_array_list(arr, others):
@@ -717,20 +718,63 @@ def str_rstrip(arr, to_strip=None):
     return _na_map(lambda x: x.rstrip(to_strip), arr)
 
 
-def str_wrap(arr, width=80):
+def str_wrap(arr, width, **kwargs):
     """
     Wrap long strings to be formatted in paragraphs
 
     Parameters
     ----------
+    Same keyword parameters and defaults as :class:`textwrap.TextWrapper`
     width : int
         Maximum line-width
+    expand_tabs : bool, optional
+        If true, tab characters will be expanded to spaces (default: True)
+    replace_whitespace : bool, optional
+        If true, each whitespace character (as defined by string.whitespace) remaining
+        after tab expansion will be replaced by a single space (default: True)
+    drop_whitespace : bool, optional
+        If true, whitespace that, after wrapping, happens to end up at the beginning
+        or end of a line is dropped (default: True)
+    break_long_words : bool, optional
+        If true, then words longer than width will be broken in order to ensure that
+        no lines are longer than width. If it is false, long words will not be broken,
+        and some lines may be longer than width. (default: True)
+    break_on_hyphens : bool, optional
+        If true, wrapping will occur preferably on whitespace and right after hyphens
+        in compound words, as is customary in English. If false, only whitespace
+        will be considered as a potentially good place for line breaks, but you need
+        to set break_long_words to false if you want truly unbreakable words.
+        (default: True)
 
     Returns
     -------
     wrapped : array
+
+    Notes
+    -----
+    Internally, this method uses a :class:`textwrap.TextWrapper` instance built from
+    ``width`` and any keyword arguments given. To achieve behavior matching R's stringr
+    library str_wrap function, use the arguments:
+
+        expand_tabs = False
+        replace_whitespace = True
+        drop_whitespace = True
+        break_long_words = False
+        break_on_hyphens = False
+
+    Examples
+    --------
+
+    >>> s = pd.Series(['line to be wrapped', 'another line to be wrapped'])
+    >>> s.str.wrap(12)
+    0             line to be\nwrapped
+    1    another line\nto be\nwrapped
     """
-    raise NotImplementedError
+    kwargs['width'] = width
+
+    tw = textwrap.TextWrapper(**kwargs)
+
+    return _na_map(lambda s: '\n'.join(tw.wrap(s)), arr)
 
 
 def str_get(arr, i):
@@ -955,6 +999,11 @@ def rstrip(self, to_strip=None):
         result = str_rstrip(self.series, to_strip)
         return self._wrap_result(result)
 
+    @copy(str_wrap)
+    def wrap(self, width, **kwargs):
+        result = str_wrap(self.series, width, **kwargs)
+        return self._wrap_result(result)
+
     @copy(str_get_dummies)
     def get_dummies(self, sep='|'):
         result = str_get_dummies(self.series, sep)
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
@@ -947,7 +947,31 @@ def test_strip_lstrip_rstrip_args_unicode(self):
         assert_series_equal(rs, xp)
 
     def test_wrap(self):
-        pass
+        # test values are: two words less than width, two words equal to width,
+        # two words greater than width, one word less than width, one word equal
+        # to width, one word greater than width, multiple tokens with trailing
+        # whitespace equal to width, multiple tokens greater than width, a lone tab
+        values = Series([u('hello world'), u('hello world!'),
+                         u('hello world!!'), u('abcdefabcde'),
+                         u('abcdefabcdef'), u('abcdefabcdefa'),
+                         u('ab ab ab ab '), u('ab ab ab ab a'),
+                         u('\t')])
+
+        # expected values
+        xp = Series([u('hello world'), u('hello world!'),
+                     u('hello\nworld!!'), u('abcdefabcde'),
+                     u('abcdefabcdef'), u('abcdefabcdef\na'),
+                     u('ab ab ab ab'), u('ab ab ab ab\na'),
+                     u('')])
+
+        rs = values.str.wrap(12, break_long_words=True)
+        assert_series_equal(rs, xp)
+
+        # test with pre and post whitespace (non-unicode), NaN, and non-ascii Unicode
+        values = Series(['  pre  ', np.nan, u('\xac\u20ac\U00008000 abadcafe')])
+        xp = Series(['  pre', NA, u('\xac\u20ac\U00008000 ab\nadcafe')])
+        rs = values.str.wrap(6)
+        assert_series_equal(rs, xp)
 
     def test_get(self):
         values = Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h'])