pandas-dev · jreback · Feb 15, 2021 · Feb 7, 2021 · jbrockmendel · Feb 14, 2021
diff --git a/doc/source/user_guide/gotchas.rst b/doc/source/user_guide/gotchas.rst
@@ -178,6 +178,75 @@ To test for membership in the values, use the method :meth:`~pandas.Series.isin`
 For ``DataFrames``, likewise, ``in`` applies to the column axis,
 testing for membership in the list of column names.
 
+.. _udf-mutation:
+
+Mutating with User Defined Function (UDF) methods
+-------------------------------------------------
+
+It is a general rule in programming that one should not mutate a container
+while it is being iterated over. Mutation will invalidate the iterator,
+causing unexpected behavior. Consider the example:
+
+.. ipython:: python
+
+   values = [0, 1, 2, 3, 4, 5]
+   n_removed = 0
+   for k, value in enumerate(values):
+       idx = k - n_removed
+       if value % 2 == 1:
+           del values[idx]
+           n_removed += 1
+       else:
+           values[idx] = value + 1
+   values
+
+One probably would have expected that the result would be ``[1, 3, 5]``,
+but mutating ``values`` while iterating over it yields ``[1, 4, 5]`` instead.
+When using a pandas method that takes a UDF, internally pandas is often
+iterating over the ``DataFrame`` or other pandas object. Therefore, if the
+UDF mutates (changes) the ``DataFrame``, unexpected behavior can arise.
+
+Here is a similar example with :meth:`DataFrame.apply`:
+
+.. ipython:: python
+
+   def f(s):
+       s.pop("a")
+       return s
+
+   df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+   try:
+       df.apply(f, axis="columns")
+   except Exception as err:
+       print(repr(err))
+
+To resolve this issue, one can make a copy so that the mutation does
+not apply to the container being iterated over.
+
+.. ipython:: python
+
+   values = [0, 1, 2, 3, 4, 5]
+   n_removed = 0
+   for k, value in enumerate(values.copy()):
+       idx = k - n_removed
+       if value % 2 == 1:
+           del values[idx]
+           n_removed += 1
+       else:
+           values[idx] = value + 1
+   values
+
+.. ipython:: python
+
+   def f(s):
+       s = s.copy()
+       s.pop("a")
+       return s
+
+   df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+   df.apply(f, axis="columns")
+
+
 ``NaN``, Integer ``NA`` values and ``NA`` type promotions
 ---------------------------------------------------------
 

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -7814,6 +7814,12 @@ def apply(
         DataFrame.aggregate: Only perform aggregating type operations.
         DataFrame.transform: Only perform transforming type operations.
 
+        Notes
+        -----
+        Functions that mutate the passed object can produce unexpected
+        behavior or errors and are not supported. See :ref:`udf-mutation`
+        for more details.
+
         Examples
         --------
         >>> df = pd.DataFrame([[4, 9]] * 3, columns=['A', 'B'])

diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -580,6 +580,12 @@ def filter(self, func, dropna=True, *args, **kwargs):
         dropna : Drop groups that do not pass the filter. True by default;
             if False, groups that evaluate False are filled with NaNs.
 
+        Notes
+        -----
+        Functions that mutate the passed object can produce unexpected
+        behavior or errors and are not supported. See :ref:`udf-mutation`
+        for more details.
+
         Examples
         --------
         >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
@@ -1506,6 +1512,10 @@ def filter(self, func, dropna=True, *args, **kwargs):
         Each subframe is endowed the attribute 'name' in case you need to know
         which group you are working on.
 
+        Functions that mutate the passed object can produce unexpected
+        behavior or errors and are not supported. See :ref:`udf-mutation`
+        for more details.
+
         Examples
         --------
         >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -344,7 +344,7 @@ class providing the base-class of operations.
   in the subframe. If f also supports application to the entire subframe,
   then a fast path is used starting from the second chunk.
 * f must not mutate groups. Mutation is not supported and may
-  produce unexpected results.
+  produce unexpected results. See :ref:`udf-mutation` for more details.
 
 When using ``engine='numba'``, there will be no "fall back" behavior internally.
 The group data and group index will be passed as numpy arrays to the JITed
@@ -447,6 +447,10 @@ class providing the base-class of operations.
 The group data and group index will be passed as numpy arrays to the JITed
 user defined function, and no alternative execution attempts will be tried.
+
+Functions that mutate the passed object can produce unexpected
+behavior or errors and are not supported. See :ref:`udf-mutation`
+for more details.
 {examples}
 """
 
 

diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -4044,6 +4044,12 @@ def apply(
         Series.agg: Only perform aggregating type operations.
         Series.transform: Only perform transforming type operations.
 
+        Notes
+        -----
+        Functions that mutate the passed object can produce unexpected
+        behavior or errors and are not supported. See :ref:`udf-mutation`
+        for more details.
+
         Examples
         --------
         Create a series with typical summer temperatures for each city.

diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py
@@ -41,6 +41,10 @@
 -----
 `agg` is an alias for `aggregate`. Use the alias.
 
+Functions that mutate the passed object can produce unexpected
+behavior or errors and are not supported. See :ref:`udf-mutation`
+for more details.
+
 A passed user-defined-function will be passed a Series for evaluation.
 {examples}"""
 
@@ -296,6 +300,12 @@
 {klass}.agg : Only perform aggregating type operations.
 {klass}.apply : Invoke function on a {klass}.
 
+Notes
+-----
+Functions that mutate the passed object can produce unexpected
+behavior or errors and are not supported. See :ref:`udf-mutation`
+for more details.
+
 Examples
 --------
 >>> df = pd.DataFrame({{'A': range(3), 'B': range(1, 4)}})