Skip to content

Commit b88d014

Browse files
committed
Merge pull request #7845 from jreback/setting_with_copy
COMPAT: SettingWithCopy will now warn when slices which can generate views are then set
2 parents 6a362d1 + 048481b commit b88d014

File tree

10 files changed

+105
-23
lines changed

10 files changed

+105
-23
lines changed

doc/source/v0.15.0.txt

+16
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,22 @@ API changes
9191
df
9292
df.dtypes
9393

94+
- ``SettingWithCopy`` raise/warnings (according to the option ``mode.chained_assignment``) will now be issued when setting a value on a sliced mixed-dtype DataFrame using chained-assignment. (:issue:`7845`)
95+
96+
.. code-block:: python
97+
98+
In [1]: df = DataFrame(np.arange(0,9), columns=['count'])
99+
100+
In [2]: df['group'] = 'b'
101+
102+
In [3]: df.iloc[0:5]['group'] = 'a'
103+
/usr/local/bin/ipython:1: SettingWithCopyWarning:
104+
A value is trying to be set on a copy of a slice from a DataFrame.
105+
Try using .loc[row_indexer,col_indexer] = value instead
106+
107+
See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
108+
109+
94110
.. _whatsnew_0150.cat:
95111

96112
Categoricals in Series/DataFrame

pandas/core/frame.py

+3-4
Original file line numberDiff line numberDiff line change
@@ -2075,21 +2075,20 @@ def _set_item(self, key, value):
20752075
Add series to DataFrame in specified column.
20762076
20772077
If series is a numpy-array (not a Series/TimeSeries), it must be the
2078-
same length as the DataFrame's index or an error will be thrown.
2078+
same length as the DataFrames index or an error will be thrown.
20792079
2080-
Series/TimeSeries will be conformed to the DataFrame's index to
2080+
Series/TimeSeries will be conformed to the DataFrames index to
20812081
ensure homogeneity.
20822082
"""
20832083

2084-
is_existing = key in self.columns
20852084
self._ensure_valid_index(value)
20862085
value = self._sanitize_column(key, value)
20872086
NDFrame._set_item(self, key, value)
20882087

20892088
# check if we are modifying a copy
20902089
# try to set first as we want an invalid
20912090
# value exeption to occur first
2092-
if is_existing:
2091+
if len(self):
20932092
self._check_setitem_copy()
20942093

20952094
def insert(self, loc, column, value, allow_duplicates=False):

pandas/core/generic.py

+37-9
Original file line numberDiff line numberDiff line change
@@ -1133,7 +1133,13 @@ def _slice(self, slobj, axis=0, typ=None):
11331133
11341134
"""
11351135
axis = self._get_block_manager_axis(axis)
1136-
return self._constructor(self._data.get_slice(slobj, axis=axis))
1136+
result = self._constructor(self._data.get_slice(slobj, axis=axis))
1137+
1138+
# this could be a view
1139+
# but only in a single-dtyped view slicable case
1140+
is_copy = axis!=0 or result._is_view
1141+
result._set_is_copy(self, copy=is_copy)
1142+
return result
11371143

11381144
def _set_item(self, key, value):
11391145
self._data.set(key, value)
@@ -1149,10 +1155,28 @@ def _set_is_copy(self, ref=None, copy=True):
11491155
self.is_copy = None
11501156

11511157
def _check_setitem_copy(self, stacklevel=4, t='setting'):
1152-
""" validate if we are doing a settitem on a chained copy.
1158+
"""
1159+
1160+
validate if we are doing a settitem on a chained copy.
11531161
11541162
If you call this function, be sure to set the stacklevel such that the
1155-
user will see the error *at the level of setting*"""
1163+
user will see the error *at the level of setting*
1164+
1165+
It is technically possible to figure out that we are setting on
1166+
a copy even WITH a multi-dtyped pandas object. In other words, some blocks
1167+
may be views while other are not. Currently _is_view will ALWAYS return False
1168+
for multi-blocks to avoid having to handle this case.
1169+
1170+
df = DataFrame(np.arange(0,9), columns=['count'])
1171+
df['group'] = 'b'
1172+
1173+
# this technically need not raise SettingWithCopy if both are view (which is not
1174+
# generally guaranteed but is usually True
1175+
# however, this is in general not a good practice and we recommend using .loc
1176+
df.iloc[0:5]['group'] = 'a'
1177+
1178+
"""
1179+
11561180
if self.is_copy:
11571181

11581182
value = config.get_option('mode.chained_assignment')
@@ -1170,14 +1194,18 @@ def _check_setitem_copy(self, stacklevel=4, t='setting'):
11701194
pass
11711195

11721196
if t == 'referant':
1173-
t = ("A value is trying to be set on a copy of a slice from a "
1197+
t = ("\n"
1198+
"A value is trying to be set on a copy of a slice from a "
11741199
"DataFrame\n\n"
1175-
"See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy")
1200+
"See the the caveats in the documentation: "
1201+
"http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy")
11761202
else:
1177-
t = ("A value is trying to be set on a copy of a slice from a "
1178-
"DataFrame.\nTry using .loc[row_index,col_indexer] = value "
1179-
"instead\n\n"
1180-
"See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy")
1203+
t = ("\n"
1204+
"A value is trying to be set on a copy of a slice from a "
1205+
"DataFrame.\n"
1206+
"Try using .loc[row_indexer,col_indexer] = value instead\n\n"
1207+
"See the the caveats in the documentation: "
1208+
"http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy")
11811209

11821210
if value == 'raise':
11831211
raise SettingWithCopyError(t)

pandas/core/groupby.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
notnull, _DATELIKE_DTYPES, is_numeric_dtype,
2626
is_timedelta64_dtype, is_datetime64_dtype,
2727
is_categorical_dtype)
28-
28+
from pandas.core.config import option_context
2929
from pandas import _np_version_under1p7
3030
import pandas.lib as lib
3131
from pandas.lib import Timestamp
@@ -635,7 +635,9 @@ def apply(self, func, *args, **kwargs):
635635

636636
@wraps(func)
637637
def f(g):
638-
return func(g, *args, **kwargs)
638+
# ignore SettingWithCopy here in case the user mutates
639+
with option_context('mode.chained_assignment',None):
640+
return func(g, *args, **kwargs)
639641

640642
return self._python_apply_general(f)
641643

pandas/core/internals.py

+8
Original file line numberDiff line numberDiff line change
@@ -2519,6 +2519,14 @@ def is_view(self):
25192519
""" return a boolean if we are a single block and are a view """
25202520
if len(self.blocks) == 1:
25212521
return self.blocks[0].values.base is not None
2522+
2523+
# It is technically possible to figure out which blocks are views
2524+
# e.g. [ b.values.base is not None for b in self.blocks ]
2525+
# but then we have the case of possibly some blocks being a view
2526+
# and some blocks not. setting in theory is possible on the non-view
2527+
# blocks w/o causing a SettingWithCopy raise/warn. But this is a bit
2528+
# complicated
2529+
25222530
return False
25232531

25242532
def get_bool_data(self, copy=False):

pandas/tests/test_frame.py

+17-4
Original file line numberDiff line numberDiff line change
@@ -418,8 +418,12 @@ def test_setitem(self):
418418
self.frame['col8'] = 'foo'
419419
assert((self.frame['col8'] == 'foo').all())
420420

421+
# this is partially a view (e.g. some blocks are view)
422+
# so raise/warn
421423
smaller = self.frame[:2]
422-
smaller['col10'] = ['1', '2']
424+
def f():
425+
smaller['col10'] = ['1', '2']
426+
self.assertRaises(com.SettingWithCopyError, f)
423427
self.assertEqual(smaller['col10'].dtype, np.object_)
424428
self.assertTrue((smaller['col10'] == ['1', '2']).all())
425429

@@ -830,8 +834,11 @@ def test_fancy_getitem_slice_mixed(self):
830834
self.assertEqual(sliced['D'].dtype, np.float64)
831835

832836
# get view with single block
837+
# setting it triggers setting with copy
833838
sliced = self.frame.ix[:, -3:]
834-
sliced['C'] = 4.
839+
def f():
840+
sliced['C'] = 4.
841+
self.assertRaises(com.SettingWithCopyError, f)
835842
self.assertTrue((self.frame['C'] == 4).all())
836843

837844
def test_fancy_setitem_int_labels(self):
@@ -1618,7 +1625,10 @@ def test_irow(self):
16181625
assert_frame_equal(result, expected)
16191626

16201627
# verify slice is view
1621-
result[2] = 0.
1628+
# setting it makes it raise/warn
1629+
def f():
1630+
result[2] = 0.
1631+
self.assertRaises(com.SettingWithCopyError, f)
16221632
exp_col = df[2].copy()
16231633
exp_col[4:8] = 0.
16241634
assert_series_equal(df[2], exp_col)
@@ -1645,7 +1655,10 @@ def test_icol(self):
16451655
assert_frame_equal(result, expected)
16461656

16471657
# verify slice is view
1648-
result[8] = 0.
1658+
# and that we are setting a copy
1659+
def f():
1660+
result[8] = 0.
1661+
self.assertRaises(com.SettingWithCopyError, f)
16491662
self.assertTrue((df[8] == 0).all())
16501663

16511664
# list of integers

pandas/tests/test_groupby.py

+6-3
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from pandas.core.groupby import (SpecificationError, DataError,
1414
_nargsort, _lexsort_indexer)
1515
from pandas.core.series import Series
16+
from pandas.core.config import option_context
1617
from pandas.util.testing import (assert_panel_equal, assert_frame_equal,
1718
assert_series_equal, assert_almost_equal,
1819
assert_index_equal, assertRaisesRegexp)
@@ -2299,9 +2300,11 @@ def f(group):
22992300

23002301
self.assertEqual(result['d'].dtype, np.float64)
23012302

2302-
for key, group in grouped:
2303-
res = f(group)
2304-
assert_frame_equal(res, result.ix[key])
2303+
# this is by definition a mutating operation!
2304+
with option_context('mode.chained_assignment',None):
2305+
for key, group in grouped:
2306+
res = f(group)
2307+
assert_frame_equal(res, result.ix[key])
23052308

23062309
def test_groupby_wrong_multi_labels(self):
23072310
from pandas import read_csv

pandas/tests/test_indexing.py

+7
Original file line numberDiff line numberDiff line change
@@ -3246,6 +3246,13 @@ def f():
32463246
df['column1'] = df['column1'] + 'c'
32473247
str(df)
32483248

3249+
# from SO: http://stackoverflow.com/questions/24054495/potential-bug-setting-value-for-undefined-column-using-iloc
3250+
df = DataFrame(np.arange(0,9), columns=['count'])
3251+
df['group'] = 'b'
3252+
def f():
3253+
df.iloc[0:5]['group'] = 'a'
3254+
self.assertRaises(com.SettingWithCopyError, f)
3255+
32493256
def test_float64index_slicing_bug(self):
32503257
# GH 5557, related to slicing a float index
32513258
ser = {256: 2321.0, 1: 78.0, 2: 2716.0, 3: 0.0, 4: 369.0, 5: 0.0, 6: 269.0, 7: 0.0, 8: 0.0, 9: 0.0, 10: 3536.0, 11: 0.0, 12: 24.0, 13: 0.0, 14: 931.0, 15: 0.0, 16: 101.0, 17: 78.0, 18: 9643.0, 19: 0.0, 20: 0.0, 21: 0.0, 22: 63761.0, 23: 0.0, 24: 446.0, 25: 0.0, 26: 34773.0, 27: 0.0, 28: 729.0, 29: 78.0, 30: 0.0, 31: 0.0, 32: 3374.0, 33: 0.0, 34: 1391.0, 35: 0.0, 36: 361.0, 37: 0.0, 38: 61808.0, 39: 0.0, 40: 0.0, 41: 0.0, 42: 6677.0, 43: 0.0, 44: 802.0, 45: 0.0, 46: 2691.0, 47: 0.0, 48: 3582.0, 49: 0.0, 50: 734.0, 51: 0.0, 52: 627.0, 53: 70.0, 54: 2584.0, 55: 0.0, 56: 324.0, 57: 0.0, 58: 605.0, 59: 0.0, 60: 0.0, 61: 0.0, 62: 3989.0, 63: 10.0, 64: 42.0, 65: 0.0, 66: 904.0, 67: 0.0, 68: 88.0, 69: 70.0, 70: 8172.0, 71: 0.0, 72: 0.0, 73: 0.0, 74: 64902.0, 75: 0.0, 76: 347.0, 77: 0.0, 78: 36605.0, 79: 0.0, 80: 379.0, 81: 70.0, 82: 0.0, 83: 0.0, 84: 3001.0, 85: 0.0, 86: 1630.0, 87: 7.0, 88: 364.0, 89: 0.0, 90: 67404.0, 91: 9.0, 92: 0.0, 93: 0.0, 94: 7685.0, 95: 0.0, 96: 1017.0, 97: 0.0, 98: 2831.0, 99: 0.0, 100: 2963.0, 101: 0.0, 102: 854.0, 103: 0.0, 104: 0.0, 105: 0.0, 106: 0.0, 107: 0.0, 108: 0.0, 109: 0.0, 110: 0.0, 111: 0.0, 112: 0.0, 113: 0.0, 114: 0.0, 115: 0.0, 116: 0.0, 117: 0.0, 118: 0.0, 119: 0.0, 120: 0.0, 121: 0.0, 122: 0.0, 123: 0.0, 124: 0.0, 125: 0.0, 126: 67744.0, 127: 22.0, 128: 264.0, 129: 0.0, 260: 197.0, 268: 0.0, 265: 0.0, 269: 0.0, 261: 0.0, 266: 1198.0, 267: 0.0, 262: 2629.0, 258: 775.0, 257: 0.0, 263: 0.0, 259: 0.0, 264: 163.0, 250: 10326.0, 251: 0.0, 252: 1228.0, 253: 0.0, 254: 2769.0, 255: 0.0}

pandas/tools/pivot.py

+5
Original file line numberDiff line numberDiff line change
@@ -228,9 +228,14 @@ def _all_key(key):
228228
if len(rows) > 0:
229229
margin = data[rows + values].groupby(rows).agg(aggfunc)
230230
cat_axis = 1
231+
231232
for key, piece in table.groupby(level=0, axis=cat_axis):
232233
all_key = _all_key(key)
234+
235+
# we are going to mutate this, so need to copy!
236+
piece = piece.copy()
233237
piece[all_key] = margin[key]
238+
234239
table_pieces.append(piece)
235240
margin_keys.append(all_key)
236241
else:

pandas/tools/tests/test_merge.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -1320,6 +1320,7 @@ def test_append_many(self):
13201320
result = chunks[0].append(chunks[1:])
13211321
tm.assert_frame_equal(result, self.frame)
13221322

1323+
chunks[-1] = chunks[-1].copy()
13231324
chunks[-1]['foo'] = 'bar'
13241325
result = chunks[0].append(chunks[1:])
13251326
tm.assert_frame_equal(result.ix[:, self.frame.columns], self.frame)
@@ -1673,7 +1674,7 @@ def test_join_dups(self):
16731674
def test_handle_empty_objects(self):
16741675
df = DataFrame(np.random.randn(10, 4), columns=list('abcd'))
16751676

1676-
baz = df[:5]
1677+
baz = df[:5].copy()
16771678
baz['foo'] = 'bar'
16781679
empty = df[5:5]
16791680

0 commit comments

Comments
 (0)