Merge pull request #7845 from jreback/setting_with_copy

jreback · jreback · commit b88d01411761 · 2014-07-26T08:47:09.000-04:00
COMPAT: SettingWithCopy will now warn when slices which can generate views are then set
diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt
@@ -91,6 +91,22 @@ API changes
         df
         df.dtypes
 
+- ``SettingWithCopy`` raise/warnings (according to the option ``mode.chained_assignment``) will now be issued when setting a value on a sliced mixed-dtype DataFrame using chained-assignment. (:issue:`7845`)
+
+  .. code-block:: python
+
+     In [1]: df = DataFrame(np.arange(0,9), columns=['count'])
+
+     In [2]: df['group'] = 'b'
+
+     In [3]: df.iloc[0:5]['group'] = 'a'
+     /usr/local/bin/ipython:1: SettingWithCopyWarning:
+     A value is trying to be set on a copy of a slice from a DataFrame.
+     Try using .loc[row_indexer,col_indexer] = value instead
+
+     See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
+
+
 .. _whatsnew_0150.cat:
 
 Categoricals in Series/DataFrame
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -2075,21 +2075,20 @@ def _set_item(self, key, value):
         Add series to DataFrame in specified column.
 
         If series is a numpy-array (not a Series/TimeSeries), it must be the
-        same length as the DataFrame's index or an error will be thrown.
+        same length as the DataFrames index or an error will be thrown.
 
-        Series/TimeSeries will be conformed to the DataFrame's index to
+        Series/TimeSeries will be conformed to the DataFrames index to
         ensure homogeneity.
         """
 
-        is_existing = key in self.columns
         self._ensure_valid_index(value)
         value = self._sanitize_column(key, value)
         NDFrame._set_item(self, key, value)
 
         # check if we are modifying a copy
         # try to set first as we want an invalid
         # value exeption to occur first
-        if is_existing:
+        if len(self):
             self._check_setitem_copy()
 
     def insert(self, loc, column, value, allow_duplicates=False):
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -1133,7 +1133,13 @@ def _slice(self, slobj, axis=0, typ=None):
 
         """
         axis = self._get_block_manager_axis(axis)
-        return self._constructor(self._data.get_slice(slobj, axis=axis))
+        result = self._constructor(self._data.get_slice(slobj, axis=axis))
+
+        # this could be a view
+        # but only in a single-dtyped view slicable case
+        is_copy = axis!=0 or result._is_view
+        result._set_is_copy(self, copy=is_copy)
+        return result
 
     def _set_item(self, key, value):
         self._data.set(key, value)
@@ -1149,10 +1155,28 @@ def _set_is_copy(self, ref=None, copy=True):
                 self.is_copy = None
 
     def _check_setitem_copy(self, stacklevel=4, t='setting'):
-        """ validate if we are doing a settitem on a chained copy.
+        """
+
+        validate if we are doing a settitem on a chained copy.
 
         If you call this function, be sure to set the stacklevel such that the
-        user will see the error *at the level of setting*"""
+        user will see the error *at the level of setting*
+
+        It is technically possible to figure out that we are setting on
+        a copy even WITH a multi-dtyped pandas object. In other words, some blocks
+        may be views while other are not. Currently _is_view will ALWAYS return False
+        for multi-blocks to avoid having to handle this case.
+
+        df = DataFrame(np.arange(0,9), columns=['count'])
+        df['group'] = 'b'
+
+        # this technically need not raise SettingWithCopy if both are view (which is not
+        # generally guaranteed but is usually True
+        # however, this is in general not a good practice and we recommend using .loc
+        df.iloc[0:5]['group'] = 'a'
+
+        """
+
         if self.is_copy:
 
             value = config.get_option('mode.chained_assignment')
@@ -1170,14 +1194,18 @@ def _check_setitem_copy(self, stacklevel=4, t='setting'):
                 pass
 
             if t == 'referant':
-                t = ("A value is trying to be set on a copy of a slice from a "
+                t = ("\n"
+                     "A value is trying to be set on a copy of a slice from a "
                      "DataFrame\n\n"
-                     "See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy")
+                     "See the the caveats in the documentation: "
+                     "http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy")
             else:
-                t = ("A value is trying to be set on a copy of a slice from a "
-                     "DataFrame.\nTry using .loc[row_index,col_indexer] = value "
-                     "instead\n\n"
-                     "See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy")
+                t = ("\n"
+                     "A value is trying to be set on a copy of a slice from a "
+                     "DataFrame.\n"
+                     "Try using .loc[row_indexer,col_indexer] = value instead\n\n"
+                     "See the the caveats in the documentation: "
+                     "http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy")
 
             if value == 'raise':
                 raise SettingWithCopyError(t)
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -25,7 +25,7 @@
                                notnull, _DATELIKE_DTYPES, is_numeric_dtype,
                                is_timedelta64_dtype, is_datetime64_dtype,
                                is_categorical_dtype)
-
+from pandas.core.config import option_context
 from pandas import _np_version_under1p7
 import pandas.lib as lib
 from pandas.lib import Timestamp
@@ -635,7 +635,9 @@ def apply(self, func, *args, **kwargs):
 
         @wraps(func)
         def f(g):
-            return func(g, *args, **kwargs)
+            # ignore SettingWithCopy here in case the user mutates
+            with option_context('mode.chained_assignment',None):
+                return func(g, *args, **kwargs)
 
         return self._python_apply_general(f)
 
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
@@ -2519,6 +2519,14 @@ def is_view(self):
         """ return a boolean if we are a single block and are a view """
         if len(self.blocks) == 1:
             return self.blocks[0].values.base is not None
+
+        # It is technically possible to figure out which blocks are views
+        # e.g. [ b.values.base is not None for b in self.blocks ]
+        # but then we have the case of possibly some blocks being a view
+        # and some blocks not. setting in theory is possible on the non-view
+        # blocks w/o causing a SettingWithCopy raise/warn. But this is a bit
+        # complicated
+
         return False
 
     def get_bool_data(self, copy=False):
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -418,8 +418,12 @@ def test_setitem(self):
         self.frame['col8'] = 'foo'
         assert((self.frame['col8'] == 'foo').all())
 
+        # this is partially a view (e.g. some blocks are view)
+        # so raise/warn
         smaller = self.frame[:2]
-        smaller['col10'] = ['1', '2']
+        def f():
+            smaller['col10'] = ['1', '2']
+        self.assertRaises(com.SettingWithCopyError, f)
         self.assertEqual(smaller['col10'].dtype, np.object_)
         self.assertTrue((smaller['col10'] == ['1', '2']).all())
 
@@ -830,8 +834,11 @@ def test_fancy_getitem_slice_mixed(self):
         self.assertEqual(sliced['D'].dtype, np.float64)
 
         # get view with single block
+        # setting it triggers setting with copy
         sliced = self.frame.ix[:, -3:]
-        sliced['C'] = 4.
+        def f():
+            sliced['C'] = 4.
+        self.assertRaises(com.SettingWithCopyError, f)
         self.assertTrue((self.frame['C'] == 4).all())
 
     def test_fancy_setitem_int_labels(self):
@@ -1618,7 +1625,10 @@ def test_irow(self):
         assert_frame_equal(result, expected)
 
         # verify slice is view
-        result[2] = 0.
+        # setting it makes it raise/warn
+        def f():
+            result[2] = 0.
+        self.assertRaises(com.SettingWithCopyError, f)
         exp_col = df[2].copy()
         exp_col[4:8] = 0.
         assert_series_equal(df[2], exp_col)
@@ -1645,7 +1655,10 @@ def test_icol(self):
         assert_frame_equal(result, expected)
 
         # verify slice is view
-        result[8] = 0.
+        # and that we are setting a copy
+        def f():
+            result[8] = 0.
+        self.assertRaises(com.SettingWithCopyError, f)
         self.assertTrue((df[8] == 0).all())
 
         # list of integers
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
@@ -13,6 +13,7 @@
 from pandas.core.groupby import (SpecificationError, DataError,
                                  _nargsort, _lexsort_indexer)
 from pandas.core.series import Series
+from pandas.core.config import option_context
 from pandas.util.testing import (assert_panel_equal, assert_frame_equal,
                                  assert_series_equal, assert_almost_equal,
                                  assert_index_equal, assertRaisesRegexp)
@@ -2299,9 +2300,11 @@ def f(group):
 
         self.assertEqual(result['d'].dtype, np.float64)
 
-        for key, group in grouped:
-            res = f(group)
-            assert_frame_equal(res, result.ix[key])
+        # this is by definition a mutating operation!
+        with option_context('mode.chained_assignment',None):
+            for key, group in grouped:
+                res = f(group)
+                assert_frame_equal(res, result.ix[key])
 
     def test_groupby_wrong_multi_labels(self):
         from pandas import read_csv
diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py
@@ -3246,6 +3246,13 @@ def f():
         df['column1'] = df['column1'] + 'c'
         str(df)
 
+        # from SO: http://stackoverflow.com/questions/24054495/potential-bug-setting-value-for-undefined-column-using-iloc
+        df = DataFrame(np.arange(0,9), columns=['count'])
+        df['group'] = 'b'
+        def f():
+            df.iloc[0:5]['group'] = 'a'
+        self.assertRaises(com.SettingWithCopyError, f)
+
     def test_float64index_slicing_bug(self):
         # GH 5557, related to slicing a float index
         ser = {256: 2321.0, 1: 78.0, 2: 2716.0, 3: 0.0, 4: 369.0, 5: 0.0, 6: 269.0, 7: 0.0, 8: 0.0, 9: 0.0, 10: 3536.0, 11: 0.0, 12: 24.0, 13: 0.0, 14: 931.0, 15: 0.0, 16: 101.0, 17: 78.0, 18: 9643.0, 19: 0.0, 20: 0.0, 21: 0.0, 22: 63761.0, 23: 0.0, 24: 446.0, 25: 0.0, 26: 34773.0, 27: 0.0, 28: 729.0, 29: 78.0, 30: 0.0, 31: 0.0, 32: 3374.0, 33: 0.0, 34: 1391.0, 35: 0.0, 36: 361.0, 37: 0.0, 38: 61808.0, 39: 0.0, 40: 0.0, 41: 0.0, 42: 6677.0, 43: 0.0, 44: 802.0, 45: 0.0, 46: 2691.0, 47: 0.0, 48: 3582.0, 49: 0.0, 50: 734.0, 51: 0.0, 52: 627.0, 53: 70.0, 54: 2584.0, 55: 0.0, 56: 324.0, 57: 0.0, 58: 605.0, 59: 0.0, 60: 0.0, 61: 0.0, 62: 3989.0, 63: 10.0, 64: 42.0, 65: 0.0, 66: 904.0, 67: 0.0, 68: 88.0, 69: 70.0, 70: 8172.0, 71: 0.0, 72: 0.0, 73: 0.0, 74: 64902.0, 75: 0.0, 76: 347.0, 77: 0.0, 78: 36605.0, 79: 0.0, 80: 379.0, 81: 70.0, 82: 0.0, 83: 0.0, 84: 3001.0, 85: 0.0, 86: 1630.0, 87: 7.0, 88: 364.0, 89: 0.0, 90: 67404.0, 91: 9.0, 92: 0.0, 93: 0.0, 94: 7685.0, 95: 0.0, 96: 1017.0, 97: 0.0, 98: 2831.0, 99: 0.0, 100: 2963.0, 101: 0.0, 102: 854.0, 103: 0.0, 104: 0.0, 105: 0.0, 106: 0.0, 107: 0.0, 108: 0.0, 109: 0.0, 110: 0.0, 111: 0.0, 112: 0.0, 113: 0.0, 114: 0.0, 115: 0.0, 116: 0.0, 117: 0.0, 118: 0.0, 119: 0.0, 120: 0.0, 121: 0.0, 122: 0.0, 123: 0.0, 124: 0.0, 125: 0.0, 126: 67744.0, 127: 22.0, 128: 264.0, 129: 0.0, 260: 197.0, 268: 0.0, 265: 0.0, 269: 0.0, 261: 0.0, 266: 1198.0, 267: 0.0, 262: 2629.0, 258: 775.0, 257: 0.0, 263: 0.0, 259: 0.0, 264: 163.0, 250: 10326.0, 251: 0.0, 252: 1228.0, 253: 0.0, 254: 2769.0, 255: 0.0}
diff --git a/pandas/tools/pivot.py b/pandas/tools/pivot.py
@@ -228,9 +228,14 @@ def _all_key(key):
         if len(rows) > 0:
             margin = data[rows + values].groupby(rows).agg(aggfunc)
             cat_axis = 1
+
             for key, piece in table.groupby(level=0, axis=cat_axis):
                 all_key = _all_key(key)
+
+                # we are going to mutate this, so need to copy!
+                piece = piece.copy()
                 piece[all_key] = margin[key]
+
                 table_pieces.append(piece)
                 margin_keys.append(all_key)
         else:
diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py
@@ -1320,6 +1320,7 @@ def test_append_many(self):
         result = chunks[0].append(chunks[1:])
         tm.assert_frame_equal(result, self.frame)
 
+        chunks[-1] = chunks[-1].copy()
         chunks[-1]['foo'] = 'bar'
         result = chunks[0].append(chunks[1:])
         tm.assert_frame_equal(result.ix[:, self.frame.columns], self.frame)
@@ -1673,7 +1674,7 @@ def test_join_dups(self):
     def test_handle_empty_objects(self):
         df = DataFrame(np.random.randn(10, 4), columns=list('abcd'))
 
-        baz = df[:5]
+        baz = df[:5].copy()
         baz['foo'] = 'bar'
         empty = df[5:5]