ENH: df.assign accepting dependent **kwargs (#14207)

datajanko · datajanko · commit 469ed4ca2bcf · 2017-12-23T21:52:03.000+01:00
Specifically, 'df.assign(b=1, c=lambda x:x['b'])' does not throw an exception in python 3.6 and above. Further details are discussed in Issues #14207 and #18797.
diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt
@@ -7,8 +7,3 @@ This is a major release from 0.21.1 and includes a number of API changes,
 deprecations, new features, enhancements, and performance improvements along
 with a large number of bug fixes. We recommend that all users upgrade to this
 version.
-
-.. _whatsnew_0220.api_breaking:
-
-Backwards incompatible API changes
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
@@ -119,6 +119,59 @@ Current Behavior
 
     s.rank(na_option='top')
 
+
+.. _whatsnew_0230.enhancements.assign_dependent:
+
+``.assign()`` accepts dependent arguments
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The :func:`DataFrame.assign` now accepts dependent keyword arguments on Python 3.6 and above. Previously, referencing a column created in the same ``assign`` call raised a ``KeyError``. (:issue:`14207`)
+
+Specifically, a column defined inside ``assign`` may be referenced later in the same ``assign`` call if a callable is used. For example:
+
+.. code-block:: ipython
+
+    In [3]: df = pd.DataFrame({'A': [1, 2, 3]})
+
+    In [4]: df.assign(B=df.A, C=lambda x: x['A'] + x['B'])
+    Out[4]:
+       A  B  C
+    0  1  1  2
+    1  2  2  4
+    2  3  3  6
+
+.. warning::
+
+   This may subtly change the behavior of your code when you're
+   using ``assign`` to update an existing column. Previously, callables
+   referring to other variables being updated would get the "old" values:
+
+.. code-block:: ipython
+
+    In [2]: df = pd.DataFrame({"A": [1, 2, 3]})
+
+    In [3]: df.assign(A=lambda df: df.A + 1, C=lambda df: df.A * -1)
+    Out[3]:
+       A  C
+    0  2 -1
+    1  3 -2
+    2  4 -3
+
+Now, callables will get the "new" value
+
+.. code-block:: ipython
+
+  
+    In [4]: df = pd.DataFrame({"A": [1, 2, 3]})
+
+    In [5]: df.assign(A=lambda df: df.A + 1, C=lambda df: df.A * -1)
+    Out[5]:
+       A  C
+    0  2 -2
+    1  3 -3
+    2  4 -4
+
+
 .. _whatsnew_0230.enhancements.other:
 
 Other Enhancements
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -2659,8 +2659,11 @@ def assign(self, **kwargs):
         \*\*kwargs. For python 3.5 and earlier, since \*\*kwargs is unordered,
         the columns are inserted in alphabetical order at the end of your
         DataFrame.  Assigning multiple columns within the same ``assign``
-        is possible, but you cannot reference other columns created within
-        the same ``assign`` call.
+        is possible, but for python 3.5 and earlier, you cannot reference
+        other columns created within the same ``assign`` call.
+        For python 3.6 and above it is possible to reference columns created
+        earlier in the same ``assign`` call: kwargs are evaluated in order,
+        so use a callable that refers to a previously assigned column.
 
         Examples
         --------
@@ -2699,19 +2702,21 @@ def assign(self, **kwargs):
         """
         data = self.copy()
 
-        # do all calculations first...
-        results = OrderedDict()
-        for k, v in kwargs.items():
-            results[k] = com._apply_if_callable(v, data)
-
-        # preserve order for 3.6 and later, but sort by key for 3.5 and earlier
+        # for 3.6+ kwargs are ordered, so later callables see earlier columns
         if PY36:
-            results = results.items()
+            for k, v in kwargs.items():
+                data[k] = com._apply_if_callable(v, data)
         else:
+            # for 3.5 or earlier: do all calculations first...
+            results = OrderedDict()
+            for k, v in kwargs.items():
+                results[k] = com._apply_if_callable(v, data)
+
+            # sort by key for 3.5 and earlier
             results = sorted(results.items())
-        # ... and then assign
-        for k, v in results:
-            data[k] = v
+            # ... and then assign
+            for k, v in results:
+                data[k] = v
         return data
 
     def _sanitize_column(self, key, value, broadcast=True):
diff --git a/pandas/tests/frame/test_mutate_columns.py b/pandas/tests/frame/test_mutate_columns.py
@@ -89,11 +89,35 @@ def test_assign_bad(self):
             df.assign(lambda x: x.A)
         with pytest.raises(AttributeError):
             df.assign(C=df.A, D=df.A + df.C)
+
+    @pytest.mark.skipif(PY36, reason="""Issue #14207: valid for python
+                        3.6 and above""")
+    def test_assign_bad_old_version(self):
+        df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
+
+        # Key C does not exist at definition time of df
         with pytest.raises(KeyError):
-            df.assign(C=lambda df: df.A, D=lambda df: df['A'] + df['C'])
+            df.assign(C=lambda df: df.A,
+                      D=lambda df: df['A'] + df['C'])
         with pytest.raises(KeyError):
             df.assign(C=df.A, D=lambda x: x['A'] + x['C'])
 
+    @pytest.mark.skipif(not PY36, reason="""Issue #14207: not valid for
+                        python 3.5 and below""")
+    def test_assign_dependent(self):
+        df = DataFrame({'A': [1, 2], 'B': [3, 4]})
+
+        result = df.assign(C=df.A, D=lambda x: x['A'] + x['C'])
+        expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]],
+                             columns=list('ABCD'))
+        assert_frame_equal(result, expected)
+
+        result = df.assign(C=lambda df: df.A,
+                           D=lambda df: df['A'] + df['C'])
+        expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]],
+                             columns=list('ABCD'))
+        assert_frame_equal(result, expected)
+
     def test_insert_error_msmgs(self):
 
         # GH 7432