Merge pull request pandas-dev#11325 from mjoud/namedtuples

jreback · jreback · commit 8a46de47ea43 · 2015-10-28T07:13:27.000-04:00
ENH: itertuples() returns namedtuples (closes pandas-dev#11269)
diff --git a/doc/source/basics.rst b/doc/source/basics.rst
@@ -1211,9 +1211,10 @@ To iterate over the rows of a DataFrame, you can use the following methods:
 * :meth:`~DataFrame.iterrows`: Iterate over the rows of a DataFrame as (index, Series) pairs.
   This converts the rows to Series objects, which can change the dtypes and has some
   performance implications.
-* :meth:`~DataFrame.itertuples`: Iterate over the rows of a DataFrame as tuples of the values.
-  This is a lot faster as :meth:`~DataFrame.iterrows`, and is in most cases preferable to
-  use to iterate over the values of a DataFrame.
+* :meth:`~DataFrame.itertuples`: Iterate over the rows of a DataFrame
+  as namedtuples of the values.  This is a lot faster than
+  :meth:`~DataFrame.iterrows`, and is in most cases preferable to use
+  to iterate over the values of a DataFrame.
 
 .. warning::
 
@@ -1307,7 +1308,7 @@ index value along with a Series containing the data in each row:
       df_orig['int'].dtype
 
    To preserve dtypes while iterating over the rows, it is better
-   to use :meth:`~DataFrame.itertuples` which returns tuples of the values
+   to use :meth:`~DataFrame.itertuples` which returns namedtuples of the values
    and which is generally much faster as ``iterrows``.
 
 For instance, a contrived way to transpose the DataFrame would be:
@@ -1325,9 +1326,9 @@ itertuples
 ~~~~~~~~~~
 
 The :meth:`~DataFrame.itertuples` method will return an iterator
-yielding a tuple for each row in the DataFrame. The first element
-of the tuple will be the row's corresponding index value,
-while the remaining values are the row values.
+yielding a namedtuple for each row in the DataFrame. The first element
+of the tuple will be the row's corresponding index value, while the
+remaining values are the row values.
 
 For instance,
 
@@ -1336,9 +1337,16 @@ For instance,
    for row in df.itertuples():
        print(row)
 
-This method does not convert the row to a Series object but just returns the
-values inside a tuple. Therefore, :meth:`~DataFrame.itertuples` preserves the
-data type of the values and is generally faster as :meth:`~DataFrame.iterrows`.
+This method does not convert the row to a Series object but just
+returns the values inside a namedtuple. Therefore,
+:meth:`~DataFrame.itertuples` preserves the data type of the values
+and is generally faster than :meth:`~DataFrame.iterrows`.
+
+.. note::
+
+   The column names will be renamed to positional names if they are
+   invalid Python identifiers, repeated, or start with an underscore.
+   With a large number of columns (>255), regular tuples are returned.
 
 .. _basics.dt_accessors:
 
diff --git a/doc/source/whatsnew/v0.17.1.txt b/doc/source/whatsnew/v0.17.1.txt
@@ -38,6 +38,7 @@ API changes
   Legacy Python syntax (``set([x, y])``) (:issue:`11215`)
 - Indexing with a null key will raise a ``TypeError``, instead of a ``ValueError`` (:issue:`11356`)
 - ``Series.sort_index()`` now correctly handles the ``inplace`` option (:issue:`11402`)
+- ``DataFrame.itertuples()`` now returns ``namedtuple`` objects, when possible. (:issue:`11269`)
 
 .. _whatsnew_0171.deprecations:
 
@@ -71,7 +72,7 @@ Bug Fixes
 - Bug in ``HDFStore.append`` with strings whose encoded length exceded the max unencoded length (:issue:`11234`)
 - Bug in merging ``datetime64[ns, tz]`` dtypes (:issue:`11405`)
 - Bug in ``HDFStore.select`` when comparing with a numpy scalar in a where clause (:issue:`11283`)
-- Bug in using ``DataFrame.ix`` with a multi-index indexer(:issue:`11372`) 
+- Bug in using ``DataFrame.ix`` with a multi-index indexer (:issue:`11372`)
 
 
 - Bug in tz-conversions with an ambiguous time and ``.dt`` accessors (:issue:`11295`)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -584,7 +584,7 @@ def iteritems(self):
         See also
         --------
         iterrows : Iterate over the rows of a DataFrame as (index, Series) pairs.
-        itertuples : Iterate over the rows of a DataFrame as tuples of the values.
+        itertuples : Iterate over the rows of a DataFrame as namedtuples of the values.
 
         """
         if self.columns.is_unique and hasattr(self, '_item_cache'):
@@ -617,7 +617,7 @@ def iterrows(self):
            int64
 
            To preserve dtypes while iterating over the rows, it is better
-           to use :meth:`itertuples` which returns tuples of the values
+           to use :meth:`itertuples` which returns namedtuples of the values
            and which is generally faster as ``iterrows``.
 
         2. You should **never modify** something you are iterating over.
@@ -632,7 +632,7 @@ def iterrows(self):
 
         See also
         --------
-        itertuples : Iterate over the rows of a DataFrame as tuples of the values.
+        itertuples : Iterate over the rows of a DataFrame as namedtuples of the values.
         iteritems : Iterate over (column name, Series) pairs.
 
         """
@@ -641,15 +641,23 @@ def iterrows(self):
             s = Series(v, index=columns, name=k)
             yield k, s
 
-    def itertuples(self, index=True):
+    def itertuples(self, index=True, name="Pandas"):
         """
-        Iterate over the rows of DataFrame as tuples, with index value
+        Iterate over the rows of DataFrame as namedtuples, with index value
         as first element of the tuple.
 
         Parameters
         ----------
         index : boolean, default True
             If True, return the index as the first element of the tuple.
+        name : string, default "Pandas"
+            The name of the returned namedtuple.
+
+        Notes
+        -----
+        The column names will be renamed to positional names if they are
+        invalid Python identifiers, repeated, or start with an underscore.
+        With a large number of columns (>255), regular tuples are returned.
 
         See also
         --------
@@ -666,16 +674,32 @@ def itertuples(self, index=True):
         b     2   0.2
         >>> for row in df.itertuples():
         ...     print(row)
-        ('a', 1, 0.10000000000000001)
-        ('b', 2, 0.20000000000000001)
+        ...
+        Pandas(Index='a', col1=1, col2=0.10000000000000001)
+        Pandas(Index='b', col1=2, col2=0.20000000000000001)
 
         """
         arrays = []
+        fields = []
         if index:
             arrays.append(self.index)
+            fields.append("Index")
 
         # use integer indexing because of possible duplicate column names
         arrays.extend(self.iloc[:, k] for k in range(len(self.columns)))
+
+        # Python 3 supports at most 255 arguments to constructor, and
+        # things get slow with this many fields in Python 2
+        if len(self.columns) + index < 256:
+            # `rename` is unsupported in Python 2.6
+            try:
+                itertuple = collections.namedtuple(
+                    name, fields+list(self.columns), rename=True)
+                return (itertuple(*row) for row in zip(*arrays))
+            except:
+                pass
+
+        # fallback to regular tuples
         return zip(*arrays)
 
     if compat.PY3:  # pragma: no cover
@@ -1213,7 +1237,7 @@ def to_panel(self):
 
     def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
                columns=None, header=True, index=True, index_label=None,
-               mode='w', encoding=None, compression=None, quoting=None, 
+               mode='w', encoding=None, compression=None, quoting=None,
                quotechar='"', line_terminator='\n', chunksize=None,
                tupleize_cols=False, date_format=None, doublequote=True,
                escapechar=None, decimal='.', **kwds):
@@ -1251,7 +1275,7 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
             A string representing the encoding to use in the output file,
             defaults to 'ascii' on Python 2 and 'utf-8' on Python 3.
         compression : string, optional
-            a string representing the compression to use in the output file, 
+            a string representing the compression to use in the output file,
             allowed values are 'gzip', 'bz2',
             only used when the first argument is a filename
         line_terminator : string, default '\\n'
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -5545,6 +5545,27 @@ def test_itertuples(self):
         dfaa = df[['a', 'a']]
         self.assertEqual(list(dfaa.itertuples()), [(0, 1, 1), (1, 2, 2), (2, 3, 3)])
 
+        tup = next(df.itertuples(name='TestName'))
+
+        # no support for field renaming in Python 2.6, regular tuples are returned
+        if sys.version >= LooseVersion('2.7'):
+            self.assertEqual(tup._fields, ('Index', 'a', 'b'))
+            self.assertEqual((tup.Index, tup.a, tup.b), tup)
+            self.assertEqual(type(tup).__name__, 'TestName')
+
+        df.columns = ['def', 'return']
+        tup2 = next(df.itertuples(name='TestName'))
+        self.assertEqual(tup2, (0, 1, 4))
+
+        if sys.version >= LooseVersion('2.7'):
+            self.assertEqual(tup2._fields, ('Index', '_1', '_2'))
+
+        df3 = DataFrame(dict(('f'+str(i), [i]) for i in range(1024)))
+        # will raise SyntaxError if trying to create namedtuple
+        tup3 = next(df3.itertuples())
+        self.assertFalse(hasattr(tup3, '_fields'))
+        self.assertIsInstance(tup3, tuple)
+
     def test_len(self):
         self.assertEqual(len(self.frame), len(self.frame.index))