diff --git a/RELEASE.rst b/RELEASE.rst index 4e92ecb24574a..f97708de13442 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -43,6 +43,7 @@ pandas 0.11.1 multi-index column. Note: The default value will change in 0.12 to make the default *to* write and read multi-index columns in the new format. (GH3571_, GH1651_, GH3141_) + - Add iterator to ``Series.str`` (GH3638_) **Improvements to existing features** @@ -199,7 +200,7 @@ pandas 0.11.1 .. _GH3571: https://github.com/pydata/pandas/issues/3571 .. _GH1651: https://github.com/pydata/pandas/issues/1651 .. _GH3141: https://github.com/pydata/pandas/issues/3141 - +.. _GH3638: https://github.com/pydata/pandas/issues/3638 pandas 0.11.0 ============= diff --git a/doc/source/v0.11.1.txt b/doc/source/v0.11.1.txt index a724ce96a7381..e9861301231d8 100644 --- a/doc/source/v0.11.1.txt +++ b/doc/source/v0.11.1.txt @@ -80,6 +80,27 @@ Enhancements - ``DataFrame.replace()`` now allows regular expressions on contained ``Series`` with object dtype. See the examples section in the regular docs :ref:`Replacing via String Expression ` + - ``Series.str`` now supports iteration (GH3638_). You can iterate over the + individual elements of each string in the ``Series``. Each iteration + yields a ``Series`` with either a single character at each index of the + original ``Series`` or ``NaN``. For example, + + .. ipython:: python + + strs = 'go', 'bow', 'joe', 'slow' + ds = Series(strs) + + for s in ds.str: + print s + + s + s.dropna().values.item() == 'w' + + The last element yielded by the iterator will be a ``Series`` containing + the last element of the longest string in the ``Series`` with all other + elements being ``NaN``. Here, since ``'slow'`` is the longest string + and there are no other strings with the same length, ``'w'`` is the only + non-null string in the yielded ``Series``. - Multi-index column support for reading and writing csvs @@ -133,3 +154,4 @@ on GitHub for a complete list. .. 
_GH3571: https://github.com/pydata/pandas/issues/3571 .. _GH1651: https://github.com/pydata/pandas/issues/1651 .. _GH3141: https://github.com/pydata/pandas/issues/3141 +.. _GH3638: https://github.com/pydata/pandas/issues/3638 diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 3521c9ff94b11..13e2b3b0a4cab 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -661,6 +661,14 @@ def __getitem__(self, key): else: return self.get(key) + def __iter__(self): + i = 0 + g = self.get(i) + while g.notnull().any(): + yield g + i += 1 + g = self.get(i) + def _wrap_result(self, result): return Series(result, index=self.series.index, name=self.series.name) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 2134eea186649..0eac88419f5e3 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -10,6 +10,8 @@ from numpy import nan as NA import numpy as np +from numpy.testing import assert_array_equal +from numpy.random import randint from pandas import (Index, Series, TimeSeries, DataFrame, isnull, notnull, bdate_range, date_range) @@ -25,6 +27,82 @@ class TestStringMethods(unittest.TestCase): _multiprocess_can_split_ = True + def test_iter(self): + # GH3638 + strs = 'google', 'wikimedia', 'wikipedia', 'wikitravel' + ds = Series(strs) + + for s in ds.str: + # iter must yield a Series + self.assert_(isinstance(s, Series)) + + # indices of each yielded Series should be equal to the index of + # the original Series + assert_array_equal(s.index, ds.index) + + for el in s: + # each element of the series is either a basestring or nan + self.assert_(isinstance(el, basestring) or isnull(el)) + + # desired behavior is to iterate until everything would be nan on the + # next iter so make sure the last element of the iterator was 'l' in + # this case since 'wikitravel' is the longest string + self.assertEqual(s.dropna().values.item(), 'l') + + def test_iter_empty(self): + ds = Series([], dtype=object) + + i, s = 
100, 1 + + for i, s in enumerate(ds.str): + pass + + # nothing to iterate over, so the predefined values should remain + # unchanged + self.assertEqual(i, 100) + self.assertEqual(s, 1) + + def test_iter_single_element(self): + ds = Series(['a']) + + for i, s in enumerate(ds.str): + pass + + self.assertFalse(i) + assert_series_equal(ds, s) + + def test_iter_numeric_try_string(self): + # behavior identical to empty series + dsi = Series(range(4)) + + i, s = 100, 'h' + + for i, s in enumerate(dsi.str): + pass + + self.assertEqual(i, 100) + self.assertEqual(s, 'h') + + dsf = Series(np.arange(4.)) + + for i, s in enumerate(dsf.str): + pass + + self.assertEqual(i, 100) + self.assertEqual(s, 'h') + + def test_iter_object_try_string(self): + ds = Series([slice(None, randint(10), randint(10, 20)) + for _ in xrange(4)]) + + i, s = 100, 'h' + + for i, s in enumerate(ds.str): + pass + + self.assertEqual(i, 100) + self.assertEqual(s, 'h') + def test_cat(self): one = ['a', 'a', 'b', 'b', 'c', NA] two = ['a', NA, 'b', 'd', 'foo', NA]