diff --git a/doc/source/dsintro.rst b/doc/source/dsintro.rst index d7650b6b0938f..78e2fdb46f659 100644 --- a/doc/source/dsintro.rst +++ b/doc/source/dsintro.rst @@ -95,7 +95,7 @@ constructed from the sorted keys of the dict, if possible. NaN (not a number) is the standard missing data marker used in pandas. -**From scalar value** +**From scalar value** If ``data`` is a scalar value, an index must be provided. The value will be repeated to match the length of **index**. @@ -154,7 +154,7 @@ See also the :ref:`section on attribute access`. Vectorized operations and label alignment with Series ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -When working with raw NumPy arrays, looping through value-by-value is usually +When working with raw NumPy arrays, looping through value-by-value is usually not necessary. The same is true when working with Series in pandas. Series can also be passed into most NumPy methods expecting an ndarray. @@ -324,7 +324,7 @@ From a list of dicts From a dict of tuples ~~~~~~~~~~~~~~~~~~~~~ -You can automatically create a multi-indexed frame by passing a tuples +You can automatically create a multi-indexed frame by passing a tuples dictionary. .. ipython:: python @@ -347,7 +347,7 @@ column name provided). **Missing Data** Much more will be said on this topic in the :ref:`Missing data ` -section. To construct a DataFrame with missing data, we use ``np.nan`` to +section. To construct a DataFrame with missing data, we use ``np.nan`` to represent missing values. Alternatively, you may pass a ``numpy.MaskedArray`` as the data argument to the DataFrame constructor, and its masked entries will be considered missing. @@ -370,7 +370,7 @@ set to ``'index'`` in order to use the dict keys as row labels. ``DataFrame.from_records`` takes a list of tuples or an ndarray with structured dtype. It works analogously to the normal ``DataFrame`` constructor, except that -the resulting DataFrame index may be a specific field of the structured +the resulting DataFrame index may be a specific field of the structured dtype. For example: .. ipython:: python @@ -506,25 +506,70 @@ to be inserted (for example, a ``Series`` or NumPy array), or a function of one argument to be called on the ``DataFrame``. A *copy* of the original DataFrame is returned, with the new values inserted. +.. versionmodified:: 0.23.0 + +Starting with Python 3.6 the order of ``**kwargs`` is preserved. This allows +for *dependent* assignment, where an expression later in ``**kwargs`` can refer +to a column created earlier in the same :meth:`~DataFrame.assign`. + +.. ipython:: python + + dfa = pd.DataFrame({"A": [1, 2, 3], + "B": [4, 5, 6]}) + dfa.assign(C=lambda x: x['A'] + x['B'], + D=lambda x: x['A'] + x['C']) + +In the second expression, ``x['C']`` will refer to the newly created column, +that's equal to ``dfa['A'] + dfa['B']``. + +To write code compatible with all versions of Python, split the assignment in two. + +.. ipython:: python + + dependent = pd.DataFrame({"A": [1, 1, 1]}) + (dependent.assign(A=lambda x: x['A'] + 1) + .assign(B=lambda x: x['A'] + 2)) + .. warning:: - Since the function signature of ``assign`` is ``**kwargs``, a dictionary, - the order of the new columns in the resulting DataFrame cannot be guaranteed - to match the order you pass in. To make things predictable, items are inserted - alphabetically (by key) at the end of the DataFrame. + Dependent assignment maybe subtly change the behavior of your code between + Python 3.6 and older versions of Python. + + If you wish write code that supports versions of python before and after 3.6, + you'll need to take care when passing ``assign`` expressions that + + * Updating an existing column + * Refering to the newly updated column in the same ``assign`` + + For example, we'll update column "A" and then refer to it when creating "B". + + .. code-block:: python + + >>> dependent = pd.DataFrame({"A": [1, 1, 1]}) + >>> dependent.assign(A=lambda x: x["A"] + 1, + B=lambda x: x["A"] + 2) + + For Python 3.5 and earlier the expression creating ``B`` refers to the + "old" value of ``A``, ``[1, 1, 1]``. The output is then + + .. code-block:: python + + A B + 0 2 3 + 1 2 3 + 2 2 3 + + For Python 3.6 and later, the expression creating ``A`` refers to the + "new" value of ``A``, ``[2, 2, 2]``, which results in + + .. code-block:: python - All expressions are computed first, and then assigned. So you can't refer - to another column being assigned in the same call to ``assign``. For example: + A B + 0 2 4 + 1 2 4 + 2 2 4 - .. ipython:: - :verbatim: - In [1]: # Don't do this, bad reference to `C` - df.assign(C = lambda x: x['A'] + x['B'], - D = lambda x: x['A'] + x['C']) - In [2]: # Instead, break it into two assigns - (df.assign(C = lambda x: x['A'] + x['B']) - .assign(D = lambda x: x['A'] + x['C'])) Indexing / Selection ~~~~~~~~~~~~~~~~~~~~ @@ -914,7 +959,7 @@ For example, using the earlier example data, we could do: Squeezing ~~~~~~~~~ -Another way to change the dimensionality of an object is to ``squeeze`` a 1-len +Another way to change the dimensionality of an object is to ``squeeze`` a 1-len object, similar to ``wp['Item1']``. .. ipython:: python diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index cf5a44442045b..db5c79dcb3c42 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -248,6 +248,46 @@ Current Behavior: pd.RangeIndex(1, 5) / 0 +.. _whatsnew_0230.enhancements.assign_dependent: + +``.assign()`` accepts dependent arguments +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The :func:`DataFrame.assign` now accepts dependent keyword arguments for python version later than 3.6 (see also `PEP 468 +`_). Later keyword arguments may now refer to earlier ones if the argument is a callable. See the +:ref:`documentation here ` (:issue:`14207`) + +.. ipython:: python + + df = pd.DataFrame({'A': [1, 2, 3]}) + df + df.assign(B=df.A, C=lambda x:x['A']+ x['B']) + +.. warning:: + + This may subtly change the behavior of your code when you're + using ``.assign()`` to update an existing column. Previously, callables + referring to other variables being updated would get the "old" values + + Previous Behaviour: + + .. code-block:: ipython + + In [2]: df = pd.DataFrame({"A": [1, 2, 3]}) + + In [3]: df.assign(A=lambda df: df.A + 1, C=lambda df: df.A * -1) + Out[3]: + A C + 0 2 -1 + 1 3 -2 + 2 4 -3 + + New Behaviour: + + .. ipython:: python + + df.assign(A=df.A+1, C= lambda df: df.A* -1) + .. _whatsnew_0230.enhancements.other: Other Enhancements diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6d8dcb8a1ca89..c99c59db1d8cb 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2687,12 +2687,17 @@ def assign(self, **kwargs): Notes ----- - For python 3.6 and above, the columns are inserted in the order of - \*\*kwargs. For python 3.5 and earlier, since \*\*kwargs is unordered, - the columns are inserted in alphabetical order at the end of your - DataFrame. Assigning multiple columns within the same ``assign`` - is possible, but you cannot reference other columns created within - the same ``assign`` call. + Assigning multiple columns within the same ``assign`` is possible. + For Python 3.6 and above, later items in '\*\*kwargs' may refer to + newly created or modified columns in 'df'; items are computed and + assigned into 'df' in order. For Python 3.5 and below, the order of + keyword arguments is not specified, you cannot refer to newly created + or modified columns. All items are computed first, and then assigned + in alphabetical order. + + .. versionmodified :: 0.23.0 + + Keyword argument order is maintained for Python 3.6 and later. Examples -------- @@ -2728,22 +2733,34 @@ def assign(self, **kwargs): 7 8 -1.495604 2.079442 8 9 0.549296 2.197225 9 10 -0.758542 2.302585 + + Where the keyword arguments depend on each other + + >>> df = pd.DataFrame({'A': [1, 2, 3]}) + + >>> df.assign(B=df.A, C=lambda x:x['A']+ x['B']) + A B C + 0 1 1 2 + 1 2 2 4 + 2 3 3 6 """ data = self.copy() - # do all calculations first... - results = OrderedDict() - for k, v in kwargs.items(): - results[k] = com._apply_if_callable(v, data) - - # preserve order for 3.6 and later, but sort by key for 3.5 and earlier + # >= 3.6 preserve order of kwargs if PY36: - results = results.items() + for k, v in kwargs.items(): + data[k] = com._apply_if_callable(v, data) else: + # <= 3.5: do all calculations first... + results = OrderedDict() + for k, v in kwargs.items(): + results[k] = com._apply_if_callable(v, data) + + # <= 3.5 and earlier results = sorted(results.items()) - # ... and then assign - for k, v in results: - data[k] = v + # ... and then assign + for k, v in results: + data[k] = v return data def _sanitize_column(self, key, value, broadcast=True): diff --git a/pandas/tests/frame/test_mutate_columns.py b/pandas/tests/frame/test_mutate_columns.py index 9acdf2f17d86a..8236a41d00243 100644 --- a/pandas/tests/frame/test_mutate_columns.py +++ b/pandas/tests/frame/test_mutate_columns.py @@ -89,11 +89,35 @@ def test_assign_bad(self): df.assign(lambda x: x.A) with pytest.raises(AttributeError): df.assign(C=df.A, D=df.A + df.C) + + @pytest.mark.skipif(PY36, reason="""Issue #14207: valid for python + 3.6 and above""") + def test_assign_dependent_old_python(self): + df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) + + # Key C does not exist at defition time of df with pytest.raises(KeyError): - df.assign(C=lambda df: df.A, D=lambda df: df['A'] + df['C']) + df.assign(C=lambda df: df.A, + D=lambda df: df['A'] + df['C']) with pytest.raises(KeyError): df.assign(C=df.A, D=lambda x: x['A'] + x['C']) + @pytest.mark.skipif(not PY36, reason="""Issue #14207: not valid for + python 3.5 and below""") + def test_assign_dependent(self): + df = DataFrame({'A': [1, 2], 'B': [3, 4]}) + + result = df.assign(C=df.A, D=lambda x: x['A'] + x['C']) + expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]], + columns=list('ABCD')) + assert_frame_equal(result, expected) + + result = df.assign(C=lambda df: df.A, + D=lambda df: df['A'] + df['C']) + expected = DataFrame([[1, 3, 1, 2], [2, 4, 2, 4]], + columns=list('ABCD')) + assert_frame_equal(result, expected) + def test_insert_error_msmgs(self): # GH 7432