Skip to content

Commit 469ed4c

Browse files
author
datajanko
committed
ENH: df.assign accepting dependent **kwargs (#14207)
Specifically, 'df.assign(b=1, c=lambda x:x['b'])' does not throw an exception in python 3.6 and above. Further details are discussed in Issues #14207 and #18797.
1 parent b9cc821 commit 469ed4c

File tree

4 files changed

+95
-18
lines changed

4 files changed

+95
-18
lines changed

doc/source/whatsnew/v0.22.0.txt

-5
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,3 @@ This is a major release from 0.21.1 and includes a number of API changes,
77
deprecations, new features, enhancements, and performance improvements along
88
with a large number of bug fixes. We recommend that all users upgrade to this
99
version.
10-
11-
.. _whatsnew_0220.api_breaking:
12-
13-
Backwards incompatible API changes
14-
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

doc/source/whatsnew/v0.23.0.txt

+53
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,59 @@ Current Behavior
119119

120120
s.rank(na_option='top')
121121

122+
123+
.. _whatsnew_0230.enhancements.assign_dependent:
124+
125+
``.assign()`` accepts dependent arguments
126+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
127+
128+
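The :func:`DataFrame.assign` method now accepts dependent keyword arguments on Python 3.6 and above: a later keyword argument may refer to a column created by an earlier one if it is a callable. In earlier versions this raised a ``KeyError``. (:issue:`14207`)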
129+
130+
Specifically, a new column defined inside ``assign`` may be referenced in the same ``assign`` call if a callable is used. For example:
131+
132+
.. code-block:: ipython
133+
134+
In [3]: df = pd.DataFrame({'A': [1, 2, 3]})
135+
136+
In [4]: df.assign(B=df.A, C=lambda x:x['A']+ x['B'])
137+
Out[4]:
138+
A B C
139+
0 1 1 2
140+
1 2 2 4
141+
2 3 3 6
142+
143+
.. warning::
144+
145+
This may subtly change the behavior of your code when you're
146+
using ``assign`` to update an existing column. Previously, callables
147+
referring to other variables being updated would get the "old" values.
148+
149+
.. code-block:: ipython
150+
151+
In [2]: df = pd.DataFrame({"A": [1, 2, 3]})
152+
153+
In [3]: df.assign(A=lambda df: df.A + 1, C=lambda df: df.A * -1)
154+
Out[3]:
155+
A C
156+
0 2 -1
157+
1 3 -2
158+
2 4 -3
159+
160+
Now, callables will get the "new" value
161+
162+
.. code-block:: ipython
163+
164+
165+
In [4]: df = pd.DataFrame({"A": [1, 2, 3]})
166+
167+
In [5]: df.assign(A=df.A+1, C= lambda df: df.A* -1)
168+
Out[5]:
169+
A C
170+
0 2 -2
171+
1 3 -3
172+
2 4 -4
173+
174+
122175
.. _whatsnew_0230.enhancements.other:
123176

124177
Other Enhancements

pandas/core/frame.py

+17-12
Original file line numberDiff line numberDiff line change
@@ -2659,8 +2659,11 @@ def assign(self, **kwargs):
26592659
\*\*kwargs. For python 3.5 and earlier, since \*\*kwargs is unordered,
26602660
the columns are inserted in alphabetical order at the end of your
26612661
DataFrame. Assigning multiple columns within the same ``assign``
2662-
is possible, but you cannot reference other columns created within
2663-
the same ``assign`` call.
2662+
is possible, but for python 3.5 and earlier, you cannot reference
2663+
other columns created within the same ``assign`` call.
2664+
For python 3.6 and above it is possible to reference columns created
2665+
in an assignment. To this end you have to respect the order of kwargs
2666+
and use callables referencing the assigned columns.
26642667
26652668
Examples
26662669
--------
@@ -2699,19 +2702,21 @@ def assign(self, **kwargs):
26992702
"""
27002703
data = self.copy()
27012704

2702-
# do all calculations first...
2703-
results = OrderedDict()
2704-
for k, v in kwargs.items():
2705-
results[k] = com._apply_if_callable(v, data)
2706-
2707-
# preserve order for 3.6 and later, but sort by key for 3.5 and earlier
2705+
# for 3.6 preserve order of kwargs
27082706
if PY36:
2709-
results = results.items()
2707+
for k, v in kwargs.items():
2708+
data[k] = com._apply_if_callable(v, data)
27102709
else:
2710+
# for 3.5 or earlier: do all calculations first...
2711+
results = OrderedDict()
2712+
for k, v in kwargs.items():
2713+
results[k] = com._apply_if_callable(v, data)
2714+
2715+
# sort by key for 3.5 and earlier
27112716
results = sorted(results.items())
2712-
# ... and then assign
2713-
for k, v in results:
2714-
data[k] = v
2717+
# ... and then assign
2718+
for k, v in results:
2719+
data[k] = v
27152720
return data
27162721

27172722
def _sanitize_column(self, key, value, broadcast=True):

pandas/tests/frame/test_mutate_columns.py

+25-1
Original file line numberDiff line numberDiff line change
@@ -89,11 +89,35 @@ def test_assign_bad(self):
8989
df.assign(lambda x: x.A)
9090
with pytest.raises(AttributeError):
9191
df.assign(C=df.A, D=df.A + df.C)
92+
93+
@pytest.mark.skipif(PY36, reason="""Issue #14207: valid for python
94+
3.6 and above""")
95+
def test_assign_bad_old_version(self):
96+
df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
97+
98+
# Key C does not exist at definition time of df
9299
with pytest.raises(KeyError):
93-
df.assign(C=lambda df: df.A, D=lambda df: df['A'] + df['C'])
100+
df.assign(C=lambda df: df.A,
101+
D=lambda df: df['A'] + df['C'])
94102
with pytest.raises(KeyError):
95103
df.assign(C=df.A, D=lambda x: x['A'] + x['C'])
96104

105+
@pytest.mark.skipif(not PY36, reason="""Issue #14207: not valid for
106+
python 3.5 and below""")
107+
def test_assign_dependent(self):
108+
df = DataFrame({'A': [1, 2], 'B': [3, 4]})
109+
110+
result = df.assign(C=df.A, D=lambda x: x['A'] + x['C'])
111+
expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]],
112+
columns=list('ABCD'))
113+
assert_frame_equal(result, expected)
114+
115+
result = df.assign(C=lambda df: df.A,
116+
D=lambda df: df['A'] + df['C'])
117+
expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]],
118+
columns=list('ABCD'))
119+
assert_frame_equal(result, expected)
120+
97121
def test_insert_error_msmgs(self):
98122

99123
# GH 7432

0 commit comments

Comments
 (0)