Merge pull request pandas-dev#9239 from TomAugspurger/dfTransform

Tom Augspurger · Tom Augspurger · commit c88b0ba05f98 · 2015-03-01T08:51:09.000-06:00
API: Add DataFrame.assign method
diff --git a/doc/source/_static/whatsnew_assign.png b/doc/source/_static/whatsnew_assign.png
diff --git a/doc/source/basics.rst b/doc/source/basics.rst
@@ -11,6 +11,7 @@
    from pandas.compat import lrange
    options.display.max_rows=15
 
+
 ==============================
  Essential Basic Functionality
 ==============================
@@ -793,6 +794,7 @@ This is equivalent to the following
    result
    result.loc[:,:,'ItemA']
 
+
 .. _basics.reindexing:
 
 
diff --git a/doc/source/dsintro.rst b/doc/source/dsintro.rst
@@ -450,6 +450,82 @@ available to insert at a particular location in the columns:
    df.insert(1, 'bar', df['one'])
    df
 
+.. _dsintro.chained_assignment:
+
+Assigning New Columns in Method Chains
+--------------------------------------
+
+.. versionadded:: 0.16.0
+
+Inspired by `dplyr's
+<http://cran.rstudio.com/web/packages/dplyr/vignettes/introduction.html#mutate>`__
+``mutate`` verb, DataFrame has an :meth:`~pandas.DataFrame.assign`
+method that allows you to easily create new columns that are potentially
+derived from existing columns. 
+
+.. ipython:: python
+
+   iris = read_csv('data/iris.data')
+   iris.head()
+
+   (iris.assign(sepal_ratio = iris['SepalWidth'] / iris['SepalLength'])
+        .head())
+
+Above was an example of inserting a precomputed value. We can also pass in
+a function of one argument to be evaluated on the DataFrame being assigned to.
+
+.. ipython:: python
+
+   iris.assign(sepal_ratio = lambda x: (x['SepalWidth'] /
+                                        x['SepalLength'])).head()
+
+``assign`` **always** returns a copy of the data, leaving the original
+DataFrame untouched.
+
+Passing a callable, as opposed to an actual value to be inserted, is
+useful when you don't have a reference to the DataFrame at hand. This is
+common when using ``assign`` in chains of operations. For example,
+we can limit the DataFrame to just those observations with a Sepal Length
+greater than 5, calculate the ratio, and plot:
+
+.. ipython:: python
+
+   @savefig basics_assign.png
+   (iris.query('SepalLength > 5')
+        .assign(SepalRatio = lambda x: x.SepalWidth / x.SepalLength,
+                PetalRatio = lambda x: x.PetalWidth / x.PetalLength)
+        .plot(kind='scatter', x='SepalRatio', y='PetalRatio'))
+
+Since a function is passed in, the function is computed on the DataFrame
+being assigned to. Importantly, this is the DataFrame that's been filtered
+to those rows with sepal length greater than 5. The filtering happens first,
+and then the ratio calculations. This is an example where we didn't
+have a reference to the *filtered* DataFrame available.
+
+The function signature for ``assign`` is simply ``**kwargs``. The keys
+are the column names for the new fields, and the values are either a value
+to be inserted (for example, a ``Series`` or NumPy array), or a function
+of one argument to be called on the ``DataFrame``. A *copy* of the original
+DataFrame is returned, with the new values inserted.
+
+.. warning::
+
+  Since the function signature of ``assign`` is ``**kwargs``, a dictionary,
+  the order of the new columns in the resulting DataFrame cannot be guaranteed.
+
+  All expressions are computed first, and then assigned. So you can't refer
+  to another column being assigned in the same call to ``assign``. For example:
+
+   .. ipython::
+       :verbatim:
+
+       In [1]: # Don't do this, bad reference to `C`
+               df.assign(C = lambda x: x['A'] + x['B'],
+                         D = lambda x: x['A'] + x['C'])
+       In [2]: # Instead, break it into two assigns
+               (df.assign(C = lambda x: x['A'] + x['B'])
+                  .assign(D = lambda x: x['A'] + x['C']))
+
 Indexing / Selection
 ~~~~~~~~~~~~~~~~~~~~
 The basics of indexing are as follows:
diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt
@@ -29,6 +29,47 @@ New features
 
   This method is also exposed by the lower level ``Index.get_indexer`` and ``Index.get_loc`` methods.
 
+- DataFrame assign method
+
+Inspired by `dplyr's
+<http://cran.rstudio.com/web/packages/dplyr/vignettes/introduction.html#mutate>`__ ``mutate`` verb, DataFrame has a new
+:meth:`~pandas.DataFrame.assign` method.
+The function signature for ``assign`` is simply ``**kwargs``. The keys
+are the column names for the new fields, and the values are either a value
+to be inserted (for example, a ``Series`` or NumPy array), or a function
+of one argument to be called on the ``DataFrame``. The new values are inserted,
+and the entire DataFrame (with all original and new columns) is returned.
+
+.. ipython :: python
+
+   iris = read_csv('data/iris.data')
+   iris.head()
+
+   iris.assign(sepal_ratio=iris['SepalWidth'] / iris['SepalLength']).head()
+
+Above was an example of inserting a precomputed value. We can also pass in
+a function of one argument to be evaluated on the DataFrame.
+
+.. ipython :: python
+
+    iris.assign(sepal_ratio = lambda x: (x['SepalWidth'] /
+                                         x['SepalLength'])).head()
+
+The power of ``assign`` comes when used in chains of operations. For example,
+we can limit the DataFrame to just those observations with a Sepal Length
+greater than 5, calculate the ratio, and plot:
+
+.. ipython:: python
+
+   (iris.query('SepalLength > 5')
+        .assign(SepalRatio = lambda x: x.SepalWidth / x.SepalLength,
+                PetalRatio = lambda x: x.PetalWidth / x.PetalLength)
+        .plot(kind='scatter', x='SepalRatio', y='PetalRatio'))
+
+.. image:: _static/whatsnew_assign.png
+    
+See the :ref:`documentation <dsintro.chained_assignment>` for more. (:issue:`9229`)
+
 .. _whatsnew_0160.api:
 
 .. _whatsnew_0160.api_breaking:
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -2220,6 +2220,88 @@ def insert(self, loc, column, value, allow_duplicates=False):
         self._data.insert(
             loc, column, value, allow_duplicates=allow_duplicates)
 
+    def assign(self, **kwargs):
+        """
+        Assign new columns to a DataFrame, returning a new object
+        (a copy) with all the original columns in addition to the new ones.
+
+        .. versionadded:: 0.16.0
+
+        Parameters
+        ----------
+        kwargs : keyword, value pairs
+            keywords are the column names. If the values are
+            callable, they are computed on the DataFrame and
+            assigned to the new columns. If the values are
+            not callable, (e.g. a Series, scalar, or array),
+            they are simply assigned.
+
+        Returns
+        -------
+        df : DataFrame
+            A new DataFrame with the new columns in addition to
+            all the existing columns.
+
+        Notes
+        -----
+        Since ``kwargs`` is a dictionary, the order of your
+        arguments may not be preserved, and so the order of the
+        new columns is not well defined. Assigning multiple
+        columns within the same ``assign`` is possible, but you cannot
+        reference other columns created within the same ``assign`` call.
+
+        Examples
+        --------
+        >>> df = DataFrame({'A': range(1, 11), 'B': np.random.randn(10)})
+
+        Where the value is a callable, evaluated on `df`:
+
+        >>> df.assign(ln_A = lambda x: np.log(x.A))
+            A         B      ln_A
+        0   1  0.426905  0.000000
+        1   2 -0.780949  0.693147
+        2   3 -0.418711  1.098612
+        3   4 -0.269708  1.386294
+        4   5 -0.274002  1.609438
+        5   6 -0.500792  1.791759
+        6   7  1.649697  1.945910
+        7   8 -1.495604  2.079442
+        8   9  0.549296  2.197225
+        9  10 -0.758542  2.302585
+
+        Where the value already exists and is inserted:
+
+        >>> newcol = np.log(df['A'])
+        >>> df.assign(ln_A=newcol)
+            A         B      ln_A
+        0   1  0.426905  0.000000
+        1   2 -0.780949  0.693147
+        2   3 -0.418711  1.098612
+        3   4 -0.269708  1.386294
+        4   5 -0.274002  1.609438
+        5   6 -0.500792  1.791759
+        6   7  1.649697  1.945910
+        7   8 -1.495604  2.079442
+        8   9  0.549296  2.197225
+        9  10 -0.758542  2.302585
+        """
+        data = self.copy()
+
+        # do all calculations first...
+        results = {}
+        for k, v in kwargs.items():
+
+            if callable(v):
+                results[k] = v(data)
+            else:
+                results[k] = v
+
+        # ... and then assign
+        for k, v in results.items():
+            data[k] = v
+
+        return data
+
     def _sanitize_column(self, key, value):
         # Need to make sure new columns (which go into the BlockManager as new
         # blocks) are always copied
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -13965,6 +13965,60 @@ def test_select_dtypes_bad_arg_raises(self):
         with tm.assertRaisesRegexp(TypeError, 'data type.*not understood'):
             df.select_dtypes(['blargy, blarg, blarg'])
 
+    def test_assign(self):
+        df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
+        original = df.copy()
+        result = df.assign(C=df.B / df.A)
+        expected = df.copy()
+        expected['C'] = [4, 2.5, 2]
+        assert_frame_equal(result, expected)
+
+        # lambda syntax
+        result = df.assign(C=lambda x: x.B / x.A)
+        assert_frame_equal(result, expected)
+
+        # original is unmodified
+        assert_frame_equal(df, original)
+
+        # Non-Series array-like
+        result = df.assign(C=[4, 2.5, 2])
+        assert_frame_equal(result, expected)
+        # original is unmodified
+        assert_frame_equal(df, original)
+
+        result = df.assign(B=df.B / df.A)
+        expected = expected.drop('B', axis=1).rename(columns={'C': 'B'})
+        assert_frame_equal(result, expected)
+
+        # overwrite
+        result = df.assign(A=df.A + df.B)
+        expected = df.copy()
+        expected['A'] = [5, 7, 9]
+        assert_frame_equal(result, expected)
+
+        # lambda
+        result = df.assign(A=lambda x: x.A + x.B)
+        assert_frame_equal(result, expected)
+
+    def test_assign_multiple(self):
+        df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
+        result = df.assign(C=[7, 8, 9], D=df.A, E=lambda x: x.B)
+        expected = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9],
+                              'D': [1, 2, 3], 'E': [4, 5, 6]})
+        # column order isn't preserved
+        assert_frame_equal(result.reindex_like(expected), expected)
+
+    def test_assign_bad(self):
+        df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
+        # non-keyword argument
+        with tm.assertRaises(TypeError):
+            df.assign(lambda x: x.A)
+        with tm.assertRaises(AttributeError):
+            df.assign(C=df.A, D=df.A + df.C)
+        with tm.assertRaises(KeyError):
+            df.assign(C=lambda df: df.A, D=lambda df: df['A'] + df['C'])
+        with tm.assertRaises(KeyError):
+            df.assign(C=df.A, D=lambda x: x['A'] + x['C'])
 
 def skip_if_no_ne(engine='numexpr'):
     if engine == 'numexpr':