pandas-dev · jreback · May 19, 2013 · May 17, 2013
diff --git a/RELEASE.rst b/RELEASE.rst
@@ -43,6 +43,7 @@ pandas 0.11.1
     multi-index column.  
     Note: The default value will change in 0.12 to make the default *to* write and
     read multi-index columns in the new format. (GH3571_, GH1651_, GH3141_)
+  - Add iterator to ``Series.str`` (GH3638_)
 
 **Improvements to existing features**
 
@@ -199,7 +200,7 @@ pandas 0.11.1
 .. _GH3571: https://github.com/pydata/pandas/issues/3571
 .. _GH1651: https://github.com/pydata/pandas/issues/1651
 .. _GH3141: https://github.com/pydata/pandas/issues/3141
-
+.. _GH3638: https://github.com/pydata/pandas/issues/3638
 
 pandas 0.11.0
 =============

diff --git a/doc/source/v0.11.1.txt b/doc/source/v0.11.1.txt
@@ -80,6 +80,27 @@ Enhancements
   - ``DataFrame.replace()`` now allows regular expressions on contained
     ``Series`` with object dtype. See the examples section in the regular docs
     :ref:`Replacing via String Expression <missing_data.replace_expression>`
+  - ``Series.str`` now supports iteration (GH3638_). You can iterate over the
+    individual elements of each string in the ``Series``. Each iteration
+    yields a ``Series`` with either a single character at each index of the
+    original ``Series`` or ``NaN``. For example,
+
+    .. ipython:: python
+
+        strs = 'go', 'bow', 'joe', 'slow'
+        ds = Series(strs)
+
+        for s in ds.str:
+            print s
+
+        s
+        s.dropna().values.item() == 'w'
+
+    The last element yielded by the iterator will be a ``Series`` containing
+    the last element of the longest string in the ``Series`` with all other
+    elements being ``NaN``. Here, since ``'slow'`` is the longest string
+    and there are no other strings with the same length, ``'w'`` is the only
+    non-null string in the yielded ``Series``.
 
   - Multi-index column support for reading and writing csvs
 
@@ -133,3 +154,4 @@ on GitHub for a complete list.
 .. _GH3571: https://github.com/pydata/pandas/issues/3571
 .. _GH1651: https://github.com/pydata/pandas/issues/1651
 .. _GH3141: https://github.com/pydata/pandas/issues/3141
+.. _GH3638: https://github.com/pydata/pandas/issues/3638
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
@@ -661,6 +661,14 @@ def __getitem__(self, key):
         else:
             return self.get(key)
 
+    def __iter__(self):
+        i = 0
+        g = self.get(i)
+        while g.notnull().any():
+            yield g
+            i += 1
+            g = self.get(i)
+
     def _wrap_result(self, result):
         return Series(result, index=self.series.index,
                       name=self.series.name)

diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
@@ -10,6 +10,8 @@
 
 from numpy import nan as NA
 import numpy as np
+from numpy.testing import assert_array_equal
+from numpy.random import randint
 
 from pandas import (Index, Series, TimeSeries, DataFrame, isnull, notnull,
                     bdate_range, date_range)
@@ -25,6 +27,82 @@ class TestStringMethods(unittest.TestCase):
 
     _multiprocess_can_split_ = True
 
+    def test_iter(self):
+        # GH3638
+        strs = 'google', 'wikimedia', 'wikipedia', 'wikitravel'
+        ds = Series(strs)
+
+        for s in ds.str:
+            # iter must yield a Series
+            self.assert_(isinstance(s, Series))
+
+            # indices of each yielded Series should be equal to the index of
+            # the original Series
+            assert_array_equal(s.index, ds.index)
+
+            for el in s:
+                # each element of the series is either a basestring or nan
+                self.assert_(isinstance(el, basestring) or isnull(el))
+
+        # desired behavior is to iterate until everything would be nan on the
+        # next iter so make sure the last element of the iterator was 'l' in
+        # this case since 'wikitravel' is the longest string
+        self.assertEqual(s.dropna().values.item(), 'l')
+
+    def test_iter_empty(self):
+        ds = Series([], dtype=object)
+
+        i, s = 100, 1
+
+        for i, s in enumerate(ds.str):
+            pass
+
+        # nothing to iterate over, so the predefined values should remain
+        # unchanged
+        self.assertEqual(i, 100)
+        self.assertEqual(s, 1)
+
+    def test_iter_single_element(self):
+        ds = Series(['a'])
+
+        for i, s in enumerate(ds.str):
+            pass
+
+        self.assertFalse(i)
+        assert_series_equal(ds, s)
+
+    def test_iter_numeric_try_string(self):
+        # behavior identical to empty series
+        dsi = Series(range(4))
+
+        i, s = 100, 'h'
+
+        for i, s in enumerate(dsi.str):
+            pass
+
+        self.assertEqual(i, 100)
+        self.assertEqual(s, 'h')
+
+        dsf = Series(np.arange(4.))
+
+        for i, s in enumerate(dsf.str):
+            pass
+
+        self.assertEqual(i, 100)
+        self.assertEqual(s, 'h')
+
+    def test_iter_object_try_string(self):
+        ds = Series([slice(None, randint(10), randint(10, 20))
+                     for _ in xrange(4)])
+
+        i, s = 100, 'h'
+
+        for i, s in enumerate(ds.str):
+            pass
+
+        self.assertEqual(i, 100)
+        self.assertEqual(s, 'h')
+
     def test_cat(self):
         one = ['a', 'a', 'b', 'b', 'c', NA]
         two = ['a', NA, 'b', 'd', 'foo', NA]