From f35187a8b7b6edb679d1f4a81e65032736d3ecbd Mon Sep 17 00:00:00 2001 From: Nick Eubank Date: Wed, 30 Sep 2015 09:55:43 -0700 Subject: [PATCH 1/9] Basic working implementation --- pandas/core/frame.py | 9 --- pandas/core/generic.py | 123 +++++++---------------------------- pandas/core/indexing.py | 6 +- pandas/core/series.py | 3 - pandas/tests/test_generic.py | 80 +++++++++++++++++++++++ 5 files changed, 108 insertions(+), 113 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 15a840ff3c7ba..d9dba74470d71 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2299,7 +2299,6 @@ def __setitem__(self, key, value): self._set_item(key, value) def _setitem_slice(self, key, value): - self._check_setitem_copy() self.ix._setitem_with_indexer(key, value) def _setitem_array(self, key, value): @@ -2310,7 +2309,6 @@ def _setitem_array(self, key, value): (len(key), len(self.index))) key = check_bool_indexer(self.index, key) indexer = key.nonzero()[0] - self._check_setitem_copy() self.ix._setitem_with_indexer(indexer, value) else: if isinstance(value, DataFrame): @@ -2320,7 +2318,6 @@ def _setitem_array(self, key, value): self[k1] = value[k2] else: indexer = self.ix._convert_to_indexer(key, axis=1) - self._check_setitem_copy() self.ix._setitem_with_indexer((slice(None), indexer), value) def _setitem_frame(self, key, value): @@ -2330,7 +2327,6 @@ def _setitem_frame(self, key, value): raise TypeError('Must pass DataFrame with boolean values only') self._check_inplace_setting(value) - self._check_setitem_copy() self.where(-key, value, inplace=True) def _ensure_valid_index(self, value): @@ -2366,11 +2362,6 @@ def _set_item(self, key, value): value = self._sanitize_column(key, value) NDFrame._set_item(self, key, value) - # check if we are modifying a copy - # try to set first as we want an invalid - # value exeption to occur first - if len(self): - self._check_setitem_copy() def insert(self, loc, column, value, allow_duplicates=False): """ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 3df81481f1e84..5ced0d237b9d4 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -84,7 +84,7 @@ class NDFrame(PandasObject): _internal_names = ['_data', '_cacher', '_item_cache', '_cache', 'is_copy', '_subtyp', '_index', '_default_kind', '_default_fill_value', '_metadata', - '__array_struct__', '__array_interface__'] + '__array_struct__', '__array_interface__', '_children'] _internal_names_set = set(_internal_names) _accessors = frozenset([]) _metadata = [] @@ -106,6 +106,8 @@ def __init__(self, data, axes=None, copy=False, dtype=None, object.__setattr__(self, 'is_copy', None) object.__setattr__(self, '_data', data) object.__setattr__(self, '_item_cache', {}) + object.__setattr__(self, '_children', []) + def _validate_dtype(self, dtype): """ validate the passed dtype """ @@ -1175,9 +1177,6 @@ def _maybe_update_cacher(self, clear=False, verify_is_copy=True): except: pass - if verify_is_copy: - self._check_setitem_copy(stacklevel=5, t='referant') - if clear: self._clear_item_cache() @@ -1202,9 +1201,16 @@ def _slice(self, slobj, axis=0, kind=None): # but only in a single-dtyped view slicable case is_copy = axis!=0 or result._is_view result._set_is_copy(self, copy=is_copy) + + self._children.append(weakref.ref(result)) + return result def _set_item(self, key, value): + + # If children are views, reset to copies before setting. + self._convert_views_to_copies() + self._data.set(key, value) self._clear_item_cache() @@ -1217,103 +1223,21 @@ def _set_is_copy(self, ref=None, copy=True): else: self.is_copy = None - def _check_is_chained_assignment_possible(self): - """ - check if we are a view, have a cacher, and are of mixed type - if so, then force a setitem_copy check - - should be called just near setting a value - - will return a boolean if it we are a view and are cached, but a single-dtype - meaning that the cacher should be updated following setting - """ - if self._is_view and self._is_cached: - ref = self._get_cacher() - if ref is not None and ref._is_mixed_type: - self._check_setitem_copy(stacklevel=4, t='referant', force=True) - return True - elif self.is_copy: - self._check_setitem_copy(stacklevel=4, t='referant') - return False - - def _check_setitem_copy(self, stacklevel=4, t='setting', force=False): - """ - - Parameters - ---------- - stacklevel : integer, default 4 - the level to show of the stack when the error is output - t : string, the type of setting error - force : boolean, default False - if True, then force showing an error - - validate if we are doing a settitem on a chained copy. - - If you call this function, be sure to set the stacklevel such that the - user will see the error *at the level of setting* - - It is technically possible to figure out that we are setting on - a copy even WITH a multi-dtyped pandas object. In other words, some blocks - may be views while other are not. Currently _is_view will ALWAYS return False - for multi-blocks to avoid having to handle this case. + def _convert_views_to_copies(self): + # Don't set on views. + if self._is_view: + self._data = self._data.copy() - df = DataFrame(np.arange(0,9), columns=['count']) - df['group'] = 'b' + # Before setting values, make sure children converted to copies. + for child in self._children: - # this technically need not raise SettingWithCopy if both are view (which is not - # generally guaranteed but is usually True - # however, this is in general not a good practice and we recommend using .loc - df.iloc[0:5]['group'] = 'a' - - """ - - if force or self.is_copy: - - value = config.get_option('mode.chained_assignment') - if value is None: - return - - # see if the copy is not actually refererd; if so, then disolve - # the copy weakref - try: - gc.collect(2) - if not gc.get_referents(self.is_copy()): - self.is_copy = None - return - except: - pass - - # we might be a false positive - try: - if self.is_copy().shape == self.shape: - self.is_copy = None - return - except: - pass - - # a custom message - if isinstance(self.is_copy, string_types): - t = self.is_copy - - elif t == 'referant': - t = ("\n" - "A value is trying to be set on a copy of a slice from a " - "DataFrame\n\n" - "See the caveats in the documentation: " - "http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy") - - else: - t = ("\n" - "A value is trying to be set on a copy of a slice from a " - "DataFrame.\n" - "Try using .loc[row_indexer,col_indexer] = value instead\n\n" - "See the caveats in the documentation: " - "http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy") - - if value == 'raise': - raise SettingWithCopyError(t) - elif value == 'warn': - warnings.warn(t, SettingWithCopyWarning, stacklevel=stacklevel) + # Make sure children of children converted. + child()._convert_views_to_copies() + + if child()._is_view: + child()._data = child()._data.copy() + + self._children=[] def __delitem__(self, key): """ @@ -2366,6 +2290,7 @@ def __setattr__(self, name, value): # e.g. ``obj.x`` and ``obj.x = 4`` will always reference/modify # the same attribute. + try: object.__getattribute__(self, name) return object.__setattr__(self, name, value) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 2b1cb0a1e1b31..99a12d2c07de6 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -113,6 +113,9 @@ def _get_setitem_indexer(self, key): raise IndexingError(key) def __setitem__(self, key, value): + # Make sure changes don't propagate to children + self.obj._convert_views_to_copies() + indexer = self._get_setitem_indexer(key) self._setitem_with_indexer(indexer, value) @@ -205,6 +208,7 @@ def _has_valid_positional_setitem_indexer(self, indexer): def _setitem_with_indexer(self, indexer, value): self._has_valid_setitem_indexer(indexer) + # also has the side effect of consolidating in-place from pandas import Panel, DataFrame, Series info_axis = self.obj._info_axis_number @@ -517,8 +521,6 @@ def can_do_equal_len(): if isinstance(value, ABCPanel): value = self._align_panel(indexer, value) - # check for chained assignment - self.obj._check_is_chained_assignment_possible() # actually do the set self.obj._consolidate_inplace() diff --git a/pandas/core/series.py b/pandas/core/series.py index e603c6aa75d6f..0c5b8f7139b5d 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -720,10 +720,7 @@ def setitem(key, value): self._set_with(key, value) # do the setitem - cacher_needs_updating = self._check_is_chained_assignment_possible() setitem(key, value) - if cacher_needs_updating: - self._maybe_update_cacher() def _set_with_engine(self, key, value): values = self._values diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index 478c65892173d..7c9254325cabf 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -1715,6 +1715,86 @@ def test_pct_change(self): self.assert_frame_equal(result, expected) + def test_copy_on_write(self): + + ####### + # FORWARD PROPAGATION TESTS + ####### + + ## + # Test children recorded from various slicing methods + ## + + df = pd.DataFrame({'col1':[1,2], 'col2':[3,4]}) + self.assertTrue(len(df._children)==0) + + + views = dict() + + views['loc'] = df.loc[0:0,] + views['iloc'] = df.iloc[0:1,] + views['ix'] = df.ix[0:0,] + views['loc_of_loc'] = views['loc'].loc[0:0,] + + copies = dict() + for v in views.keys(): + self.assertTrue(views[v]._is_view) + copies[v] = views[v].copy() + + + + df.loc[0,'col1'] = -88 + + for v in views.keys(): + tm.assert_frame_equal(views[v], copies[v]) + self.assertFalse(views[v]._is_view) + + ## + # Test views become copies + # during different forms of value setting. + ## + + parent = dict() + views = dict() + copies = dict() + for v in ['loc', 'iloc', 'ix', 'column', 'attribute']: + parent[v] = pd.DataFrame({'col1':[1,2], 'col2':[3,4]}) + views[v] = parent[v].loc[0:0,] + copies[v] = views[v].copy() + self.assertTrue( views[v]._is_view ) + + parent['loc'].loc[0, 'col1'] = -88 + parent['iloc'].iloc[0, 0] = -88 + parent['ix'].ix[0, 'col1'] = -88 + parent['column']['col1'] = -88 + parent['attribute'].col1 = -88 + + + for v in views.keys(): + tm.assert_frame_equal(views[v], copies[v]) + self.assertFalse(views[v]._is_view) + + + + ######## + # No Backward Propogation + ####### + df = pd.DataFrame({'col1':[1,2], 'col2':[3,4]}) + df_copy = df.copy() + + views = dict() + + views['loc'] = df.loc[0:0,] + views['iloc'] = df.iloc[0:1,] + views['ix'] = df.ix[0:0,] + views['loc_of_loc'] = views['loc'].loc[0:0,] + + for v in views.keys(): + views[v].loc[0:0,] = -99 + + tm.assert_frame_equal(df, df_copy) + + class TestPanel(tm.TestCase, Generic): _typ = Panel _comparator = lambda self, x, y: assert_panel_equal(x, y) From 90ff5ea422c8d6acf619e4e2b9a767b854cec03a Mon Sep 17 00:00:00 2001 From: Nick Eubank Date: Thu, 29 Oct 2015 19:41:37 -0700 Subject: [PATCH 2/9] now keeps columns as views --- pandas/core/frame.py | 4 +++- pandas/core/generic.py | 17 +++++++++++++---- pandas/core/indexing.py | 1 + pandas/tests/test_generic.py | 37 +++++++++++++++++++++++++++--------- 4 files changed, 45 insertions(+), 14 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d9dba74470d71..4fe1ac455f99b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1949,7 +1949,9 @@ def __getitem__(self, key): is_mi_columns = isinstance(self.columns, MultiIndex) try: if key in self.columns and not is_mi_columns: - return self._getitem_column(key) + result = self._getitem_column(key) + result._is_column_view = True + return result except: pass diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 5ced0d237b9d4..c792ff086a9ba 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -84,7 +84,8 @@ class NDFrame(PandasObject): _internal_names = ['_data', '_cacher', '_item_cache', '_cache', 'is_copy', '_subtyp', '_index', '_default_kind', '_default_fill_value', '_metadata', - '__array_struct__', '__array_interface__', '_children'] + '__array_struct__', '__array_interface__', '_children', + '_is_column_view'] _internal_names_set = set(_internal_names) _accessors = frozenset([]) _metadata = [] @@ -107,6 +108,7 @@ def __init__(self, data, axes=None, copy=False, dtype=None, object.__setattr__(self, '_data', data) object.__setattr__(self, '_item_cache', {}) object.__setattr__(self, '_children', []) + object.__setattr__(self, '_is_column_view', False) def _validate_dtype(self, dtype): @@ -1077,13 +1079,19 @@ def get(self, key, default=None): ------- value : type of items contained in object """ + try: return self[key] except (KeyError, ValueError, IndexError): return default def __getitem__(self, item): - return self._get_item_cache(item) + result = self._get_item_cache(item) + + if isinstance(item, str): + result._is_column_view = True + + return result def _get_item_cache(self, item): """ return the cached item, item represents a label indexer """ @@ -1225,7 +1233,7 @@ def _set_is_copy(self, ref=None, copy=True): def _convert_views_to_copies(self): # Don't set on views. - if self._is_view: + if self._is_view and not self._is_column_view: self._data = self._data.copy() # Before setting values, make sure children converted to copies. @@ -1234,7 +1242,7 @@ def _convert_views_to_copies(self): # Make sure children of children converted. child()._convert_views_to_copies() - if child()._is_view: + if child()._is_view and not self._is_column_view: child()._data = child()._data.copy() self._children=[] @@ -2267,6 +2275,7 @@ def __finalize__(self, other, method=None, **kwargs): return self def __getattr__(self, name): + """After regular attribute access, try looking up the name This allows simpler access to columns for interactive use. """ diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 99a12d2c07de6..c7c325f9739d0 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -57,6 +57,7 @@ def __iter__(self): raise NotImplementedError('ix is not iterable') def __getitem__(self, key): + if type(key) is tuple: try: values = self.obj.get_value(*key) diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index 7c9254325cabf..c4d9c6d4a2dfc 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -1721,9 +1721,7 @@ def test_copy_on_write(self): # FORWARD PROPAGATION TESTS ####### - ## - # Test children recorded from various slicing methods - ## + # Test various slicing methods add to _children df = pd.DataFrame({'col1':[1,2], 'col2':[3,4]}) self.assertTrue(len(df._children)==0) @@ -1749,10 +1747,8 @@ def test_copy_on_write(self): tm.assert_frame_equal(views[v], copies[v]) self.assertFalse(views[v]._is_view) - ## - # Test views become copies - # during different forms of value setting. - ## + # Test different forms of value setting + # all trigger conversions parent = dict() views = dict() @@ -1773,8 +1769,6 @@ def test_copy_on_write(self): for v in views.keys(): tm.assert_frame_equal(views[v], copies[v]) self.assertFalse(views[v]._is_view) - - ######## # No Backward Propogation @@ -1794,6 +1788,31 @@ def test_copy_on_write(self): tm.assert_frame_equal(df, df_copy) + ### + # Dictionary-like access to single columns SHOULD give views + ### + + # If change child, should back-propagate + df = pd.DataFrame({'col1':[1,2], 'col2':[3,4]}) + v = df['col1'] + self.assertTrue(v._is_view) + self.assertTrue(v._is_column_view) + v.loc[0]=-88 + self.assertTrue(df.loc[0,'col1'] == -88) + self.assertTrue(v._is_view) + + # If change parent, should forward-propagate + df = pd.DataFrame({'col1':[1,2], 'col2':[3,4]}) + v = df['col1'] + self.assertTrue(v._is_view) + self.assertTrue(v._is_column_view) + df.loc[0, 'col1']=-88 + self.assertTrue(v.loc[0] == -88) + self.assertTrue(v._is_view) + + + + class TestPanel(tm.TestCase, Generic): _typ = Panel From 4c9e5a9ec3a895342f42e1a2f7b93d50defd4912 Mon Sep 17 00:00:00 2001 From: Nick Eubank Date: Sun, 1 Nov 2015 12:47:11 -0800 Subject: [PATCH 3/9] check for dead weakrefs --- pandas/core/generic.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index c792ff086a9ba..f5a4dc7d3d744 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1232,19 +1232,22 @@ def _set_is_copy(self, ref=None, copy=True): self.is_copy = None def _convert_views_to_copies(self): - # Don't set on views. + + # Don't set on views. if self._is_view and not self._is_column_view: self._data = self._data.copy() - + # Before setting values, make sure children converted to copies. for child in self._children: - - # Make sure children of children converted. - child()._convert_views_to_copies() - if child()._is_view and not self._is_column_view: - child()._data = child()._data.copy() + if child() is not None: + + # Make sure children of children converted. + child()._convert_views_to_copies() + if child()._is_view and not self._is_column_view: + child()._data = child()._data.copy() + self._children=[] def __delitem__(self, key): From 0260b9173b29aa12b3bf8eab0f984f4970ba93de Mon Sep 17 00:00:00 2001 From: Nick Eubank Date: Sun, 1 Nov 2015 20:03:03 -0800 Subject: [PATCH 4/9] store children in weakValueDictionary instead of list --- pandas/core/generic.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f5a4dc7d3d744..ff97493bd71bd 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -107,7 +107,7 @@ def __init__(self, data, axes=None, copy=False, dtype=None, object.__setattr__(self, 'is_copy', None) object.__setattr__(self, '_data', data) object.__setattr__(self, '_item_cache', {}) - object.__setattr__(self, '_children', []) + object.__setattr__(self, '_children', weakref.WeakValueDictionary()) object.__setattr__(self, '_is_column_view', False) @@ -1210,7 +1210,7 @@ def _slice(self, slobj, axis=0, kind=None): is_copy = axis!=0 or result._is_view result._set_is_copy(self, copy=is_copy) - self._children.append(weakref.ref(result)) + self._add_to_children(result) return result @@ -1237,18 +1237,21 @@ def _convert_views_to_copies(self): if self._is_view and not self._is_column_view: self._data = self._data.copy() + # Before setting values, make sure children converted to copies. - for child in self._children: - - if child() is not None: + for child in self._children.valuerefs(): # Make sure children of children converted. child()._convert_views_to_copies() - if child()._is_view and not self._is_column_view: + if child()._is_view and not child()._is_column_view: child()._data = child()._data.copy() - self._children=[] + self._children = weakref.WeakValueDictionary() + + def _add_to_children(self, view_to_append): + self._children[id(view_to_append)] = view_to_append + def __delitem__(self, key): """ From 8cf0bbd1f24d17c296891897344538f469319b87 Mon Sep 17 00:00:00 2001 From: Nick Eubank Date: Mon, 2 Nov 2015 09:37:54 -0800 Subject: [PATCH 5/9] Remove redundant copying --- pandas/core/generic.py | 18 ++++-------------- pandas/core/indexing.py | 2 +- pandas/tests/test_generic.py | 2 -- 3 files changed, 5 insertions(+), 17 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ff97493bd71bd..9ab34f4080fc4 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1217,7 +1217,7 @@ def _slice(self, slobj, axis=0, kind=None): def _set_item(self, key, value): # If children are views, reset to copies before setting. - self._convert_views_to_copies() + self._execute_copy_on_write() self._data.set(key, value) self._clear_item_cache() @@ -1231,23 +1231,13 @@ def _set_is_copy(self, ref=None, copy=True): else: self.is_copy = None - def _convert_views_to_copies(self): + def _execute_copy_on_write(self): # Don't set on views. - if self._is_view and not self._is_column_view: + if (self._is_view and not self._is_column_view) or len(self._children) is not 0: self._data = self._data.copy() - - - # Before setting values, make sure children converted to copies. - for child in self._children.valuerefs(): - - # Make sure children of children converted. - child()._convert_views_to_copies() - - if child()._is_view and not child()._is_column_view: - child()._data = child()._data.copy() + self._children = weakref.WeakValueDictionary() - self._children = weakref.WeakValueDictionary() def _add_to_children(self, view_to_append): self._children[id(view_to_append)] = view_to_append diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index c7c325f9739d0..add69ed60fa73 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -115,7 +115,7 @@ def _get_setitem_indexer(self, key): def __setitem__(self, key, value): # Make sure changes don't propagate to children - self.obj._convert_views_to_copies() + self.obj._execute_copy_on_write() indexer = self._get_setitem_indexer(key) self._setitem_with_indexer(indexer, value) diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index c4d9c6d4a2dfc..5e2250ee78384 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -1745,7 +1745,6 @@ def test_copy_on_write(self): for v in views.keys(): tm.assert_frame_equal(views[v], copies[v]) - self.assertFalse(views[v]._is_view) # Test different forms of value setting # all trigger conversions @@ -1768,7 +1767,6 @@ def test_copy_on_write(self): for v in views.keys(): tm.assert_frame_equal(views[v], copies[v]) - self.assertFalse(views[v]._is_view) ######## # No Backward Propogation From b41ef8f7fb18a15bdf11d20f83152626d5c019ef Mon Sep 17 00:00:00 2001 From: Nick Eubank Date: Tue, 3 Nov 2015 13:45:49 -0800 Subject: [PATCH 6/9] Add back reference to origin df to protect against broken chains of views --- pandas/core/generic.py | 9 +++++++-- pandas/tests/test_generic.py | 17 +++++++++++++++++ 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 9ab34f4080fc4..5787efe57ab34 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -85,7 +85,7 @@ class NDFrame(PandasObject): 'is_copy', '_subtyp', '_index', '_default_kind', '_default_fill_value', '_metadata', '__array_struct__', '__array_interface__', '_children', - '_is_column_view'] + '_is_column_view', '_original_parent'] _internal_names_set = set(_internal_names) _accessors = frozenset([]) _metadata = [] @@ -109,6 +109,7 @@ def __init__(self, data, axes=None, copy=False, dtype=None, object.__setattr__(self, '_item_cache', {}) object.__setattr__(self, '_children', weakref.WeakValueDictionary()) object.__setattr__(self, '_is_column_view', False) + object.__setattr__(self, '_original_parent', weakref.WeakValueDictionary()) def _validate_dtype(self, dtype): @@ -1242,7 +1243,11 @@ def _execute_copy_on_write(self): def _add_to_children(self, view_to_append): self._children[id(view_to_append)] = view_to_append - + if len(self._original_parent) is 0: + view_to_append._original_parent['parent'] = self + else: + self._original_parent['parent']._add_to_children(view_to_append) + def __delitem__(self, key): """ Delete item diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index 5e2250ee78384..a52d71fa74fee 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -1808,6 +1808,23 @@ def test_copy_on_write(self): self.assertTrue(v.loc[0] == -88) self.assertTrue(v._is_view) + ### + # Make sure that no problems if view created on view and middle-view + # gets deleted + # + df = pd.DataFrame({'col1':[1,2], 'col2':[3,4]}) + v1 = df.loc[0:0,] + self.assertTrue(len(df._children)==1) + + v2 = v1.loc[0:0,] + v2_copy = v2.copy() + self.assertTrue(len(df._children)==2) + + del v1 + + df.loc[0:0, 'col1'] = -88 + + tm.assert_frame_equal(v2, v2_copy) From 181408544c1722ed140fda3d7c43cddbf16c6be9 Mon Sep 17 00:00:00 2001 From: Nick Eubank Date: Wed, 4 Nov 2015 13:28:05 -0800 Subject: [PATCH 7/9] make _is_view more conservative on multi-block objects tweak so addition of new columnsdoesn't cause copying improved tests, removed copy option from merge --- pandas/core/frame.py | 4 +- pandas/core/generic.py | 16 ++- pandas/core/internals.py | 9 +- pandas/core/panel.py | 4 +- pandas/core/reshape.py | 2 +- pandas/tests/test_frame.py | 39 ++---- pandas/tests/test_generic.py | 27 ++++- pandas/tests/test_indexing.py | 197 +------------------------------ pandas/tests/test_multilevel.py | 32 ++--- pandas/tests/test_panel.py | 3 +- pandas/tests/test_series.py | 5 - pandas/tools/merge.py | 25 ++-- pandas/tools/tests/test_merge.py | 37 +----- 13 files changed, 82 insertions(+), 318 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4fe1ac455f99b..e79d2d6de6e52 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4315,12 +4315,12 @@ def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='', @Appender(_merge_doc, indents=2) def merge(self, right, how='inner', on=None, left_on=None, right_on=None, left_index=False, right_index=False, sort=False, - suffixes=('_x', '_y'), copy=True, indicator=False): + suffixes=('_x', '_y'), indicator=False): from pandas.tools.merge import merge return merge(self, right, how=how, on=on, left_on=left_on, right_on=right_on, left_index=left_index, right_index=right_index, sort=sort, - suffixes=suffixes, copy=copy, indicator=indicator) + suffixes=suffixes, indicator=indicator) def round(self, decimals=0, out=None): """ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 5787efe57ab34..eef3832bd583d 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1089,9 +1089,6 @@ def get(self, key, default=None): def __getitem__(self, item): result = self._get_item_cache(item) - if isinstance(item, str): - result._is_column_view = True - return result def _get_item_cache(self, item): @@ -1216,10 +1213,14 @@ def _slice(self, slobj, axis=0, kind=None): return result def _set_item(self, key, value): - - # If children are views, reset to copies before setting. - self._execute_copy_on_write() + if hasattr(self, 'columns'): + if key in self.columns: + # If children are views, reset to copies before setting. + self._execute_copy_on_write() + else: + self._execute_copy_on_write() + self._data.set(key, value) self._clear_item_cache() @@ -2300,6 +2301,9 @@ def __setattr__(self, name, value): # e.g. ``obj.x`` and ``obj.x = 4`` will always reference/modify # the same attribute. + if hasattr(self, 'columns'): + if name in self.columns: + self._execute_copy_on_write() try: object.__getattribute__(self, name) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 1b08140ebec09..a1c3d3e23a4e6 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -2950,10 +2950,11 @@ def is_datelike_mixed_type(self): @property def is_view(self): - """ return a boolean if we are a single block and are a view """ - if len(self.blocks) == 1: - return self.blocks[0].is_view - + """ return a boolean True if any block is a view """ + for b in self.blocks: + if b.is_view: return True + + # It is technically possible to figure out which blocks are views # e.g. [ b.values.base is not None for b in self.blocks ] # but then we have the case of possibly some blocks being a view diff --git a/pandas/core/panel.py b/pandas/core/panel.py index d2fcd6ed19378..13fdfa08fdf4e 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -1310,8 +1310,10 @@ def update(self, other, join='left', overwrite=True, filter_func=None, other = other.reindex(**{axis_name: axis_values}) for frame in axis_values: - self[frame].update(other[frame], join, overwrite, filter_func, + temp = self[frame] + temp.update(other[frame], join, overwrite, filter_func, raise_conflict) + self[frame] = temp def _get_join_index(self, other, how): if how == 'left': diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index fecfe5cd82c6d..7eab08fb027b6 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -945,7 +945,7 @@ def melt_stub(df, stub, i, j): for stub in stubnames[1:]: new = melt_stub(df, stub, id_vars, j) - newdf = newdf.merge(new, how="outer", on=id_vars + [j], copy=False) + newdf = newdf.merge(new, how="outer", on=id_vars + [j]) return newdf.set_index([i, j]) def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 5c4fa5a2e9c56..b6532e875b423 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -193,7 +193,6 @@ def test_setitem_list(self): assert_series_equal(self.frame['B'], data['A'], check_names=False) assert_series_equal(self.frame['A'], data['B'], check_names=False) - with assertRaisesRegexp(ValueError, 'Columns must be same length as key'): data[['A']] = self.frame[['A', 'B']] with assertRaisesRegexp(ValueError, 'Length of values does not match ' @@ -560,14 +559,14 @@ def test_setitem(self): self.frame['col8'] = 'foo' assert((self.frame['col8'] == 'foo').all()) - # this is partially a view (e.g. some blocks are view) - # so raise/warn + # Changes should not propageate smaller = self.frame[:2] def f(): - smaller['col10'] = ['1', '2'] - self.assertRaises(com.SettingWithCopyError, f) - self.assertEqual(smaller['col10'].dtype, np.object_) - self.assertTrue((smaller['col10'] == ['1', '2']).all()) + smaller['col0'] = ['1', '2'] + f() + self.assertEqual(smaller['col0'].dtype, np.object_) + self.assertTrue((smaller['col0'] == ['1', '2']).all()) + self.assertNotEqual(self.frame[:2].col0.dtype, np.object_) # with a dtype for dtype in ['int32','int64','float32','float64']: @@ -1014,13 +1013,11 @@ def test_fancy_getitem_slice_mixed(self): sliced = self.mixed_frame.ix[:, -3:] self.assertEqual(sliced['D'].dtype, np.float64) - # get view with single block - # setting it triggers setting with copy + # Should never act as view due to copy on write sliced = self.frame.ix[:, -3:] def f(): - sliced['C'] = 4. - self.assertRaises(com.SettingWithCopyError, f) - self.assertTrue((self.frame['C'] == 4).all()) + sliced['C'] = 4 + self.assertTrue((self.frame['C'] != 4).all()) def test_fancy_setitem_int_labels(self): # integer index defers to label-based indexing @@ -1819,14 +1816,12 @@ def test_irow(self): expected = df.ix[8:14] assert_frame_equal(result, expected) - # verify slice is view - # setting it makes it raise/warn + # verify changes on slices never propogate def f(): result[2] = 0. - self.assertRaises(com.SettingWithCopyError, f) exp_col = df[2].copy() exp_col[4:8] = 0. - assert_series_equal(df[2], exp_col) + self.assertFalse((df[2] == exp_col).all()) # list of integers result = df.iloc[[1, 2, 4, 6]] @@ -1854,12 +1849,10 @@ def test_icol(self): expected = df.ix[:, 8:14] assert_frame_equal(result, expected) - # verify slice is view - # and that we are setting a copy + # Verify setting on view doesn't propogate def f(): result[8] = 0. - self.assertRaises(com.SettingWithCopyError, f) - self.assertTrue((df[8] == 0).all()) + self.assertTrue((df[8] != 0).all()) # list of integers result = df.iloc[:, [1, 2, 4, 6]] @@ -4335,12 +4328,6 @@ def test_constructor_with_datetime_tz(self): assert_series_equal(df['D'],Series(idx,name='D')) del df['D'] - # assert that A & C are not sharing the same base (e.g. they - # are copies) - b1 = df._data.blocks[1] - b2 = df._data.blocks[2] - self.assertTrue(b1.values.equals(b2.values)) - self.assertFalse(id(b1.values.values.base) == id(b2.values.values.base)) # with nan df2 = df.copy() diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index a52d71fa74fee..99e422fa083d5 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -1740,7 +1740,6 @@ def test_copy_on_write(self): copies[v] = views[v].copy() - df.loc[0,'col1'] = -88 for v in views.keys(): @@ -1807,6 +1806,23 @@ def test_copy_on_write(self): df.loc[0, 'col1']=-88 self.assertTrue(v.loc[0] == -88) self.assertTrue(v._is_view) + + # Does NOT hold for multi-index (can't guarantee view behaviors -- + # setting on multi-index creates new data somehow.) + index = pd.MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + frame = pd.DataFrame(np.random.randn(10, 3), index=index, + columns=pd.Index(['A', 'B', 'C'], name='exp')).T + + v = frame['foo','one'] + self.assertTrue(v._is_view) + self.assertFalse(v._is_column_view) + frame.loc['A', ('foo','one')]=-88 + self.assertFalse(v.loc['A'] == -88) + ### # Make sure that no problems if view created on view and middle-view @@ -1827,7 +1843,14 @@ def test_copy_on_write(self): tm.assert_frame_equal(v2, v2_copy) - + def test_is_view_of_multiblocks(self): + # Ensure that if even if only one block of DF is view, + # returns _is_view = True. + df = pd.DataFrame({'col1':[1,2], 'col2':[3,4]}) + s = pd.Series([0.5, 0.3, 0.4]) + df['col3'] = s[0:1] + self.assertTrue(df['col3']._is_view) + self.assertTrue(df._is_view) class TestPanel(tm.TestCase, Generic): _typ = Panel diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index cb06b714d4700..771784d91f61e 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -3892,203 +3892,8 @@ def test_setitem_chained_setfault(self): df = DataFrame(dict(A = np.array(['foo','bar','bah','foo','bar']))) df.A.iloc[0] = np.nan result = df.head() - assert_frame_equal(result, expected) - - def test_detect_chained_assignment(self): - - pd.set_option('chained_assignment','raise') - - # work with the chain - expected = DataFrame([[-5,1],[-6,3]],columns=list('AB')) - df = DataFrame(np.arange(4).reshape(2,2),columns=list('AB'),dtype='int64') - self.assertIsNone(df.is_copy) - df['A'][0] = -5 - df['A'][1] = -6 - assert_frame_equal(df, expected) - - # test with the chaining - df = DataFrame({ 'A' : Series(range(2),dtype='int64'), 'B' : np.array(np.arange(2,4),dtype=np.float64)}) - self.assertIsNone(df.is_copy) - def f(): - df['A'][0] = -5 - self.assertRaises(com.SettingWithCopyError, f) - def f(): - df['A'][1] = np.nan - self.assertRaises(com.SettingWithCopyError, f) - self.assertIsNone(df['A'].is_copy) - - # using a copy (the chain), fails - df = DataFrame({ 'A' : Series(range(2),dtype='int64'), 'B' : np.array(np.arange(2,4),dtype=np.float64)}) - def f(): - df.loc[0]['A'] = -5 - self.assertRaises(com.SettingWithCopyError, f) - - # doc example - df = DataFrame({'a' : ['one', 'one', 'two', - 'three', 'two', 'one', 'six'], - 'c' : Series(range(7),dtype='int64') }) - self.assertIsNone(df.is_copy) - expected = DataFrame({'a' : ['one', 'one', 'two', - 'three', 'two', 'one', 'six'], - 'c' : [42,42,2,3,4,42,6]}) - - def f(): - indexer = df.a.str.startswith('o') - df[indexer]['c'] = 42 - self.assertRaises(com.SettingWithCopyError, f) - - expected = DataFrame({'A':[111,'bbb','ccc'],'B':[1,2,3]}) - df = DataFrame({'A':['aaa','bbb','ccc'],'B':[1,2,3]}) - def f(): - df['A'][0] = 111 - self.assertRaises(com.SettingWithCopyError, f) - def f(): - df.loc[0]['A'] = 111 - self.assertRaises(com.SettingWithCopyError, f) - - df.loc[0,'A'] = 111 - assert_frame_equal(df,expected) - - # make sure that is_copy is picked up reconstruction - # GH5475 - df = DataFrame({"A": [1,2]}) - self.assertIsNone(df.is_copy) - with tm.ensure_clean('__tmp__pickle') as path: - df.to_pickle(path) - df2 = pd.read_pickle(path) - df2["B"] = df2["A"] - df2["B"] = df2["A"] - - # a suprious raise as we are setting the entire column here - # GH5597 - from string import ascii_letters as letters - - def random_text(nobs=100): - df = [] - for i in range(nobs): - idx= np.random.randint(len(letters), size=2) - idx.sort() - df.append([letters[idx[0]:idx[1]]]) - - return DataFrame(df, columns=['letters']) - - df = random_text(100000) - - # always a copy - x = df.iloc[[0,1,2]] - self.assertIsNotNone(x.is_copy) - x = df.iloc[[0,1,2,4]] - self.assertIsNotNone(x.is_copy) - - # explicity copy - indexer = df.letters.apply(lambda x : len(x) > 10) - df = df.ix[indexer].copy() - self.assertIsNone(df.is_copy) - df['letters'] = df['letters'].apply(str.lower) - - # implicity take - df = random_text(100000) - indexer = df.letters.apply(lambda x : len(x) > 10) - df = df.ix[indexer] - self.assertIsNotNone(df.is_copy) - df['letters'] = df['letters'].apply(str.lower) - - # implicity take 2 - df = random_text(100000) - indexer = df.letters.apply(lambda x : len(x) > 10) - df = df.ix[indexer] - self.assertIsNotNone(df.is_copy) - df.loc[:,'letters'] = df['letters'].apply(str.lower) - - # should be ok even though it's a copy! - self.assertIsNone(df.is_copy) - df['letters'] = df['letters'].apply(str.lower) - self.assertIsNone(df.is_copy) - - df = random_text(100000) - indexer = df.letters.apply(lambda x : len(x) > 10) - df.ix[indexer,'letters'] = df.ix[indexer,'letters'].apply(str.lower) - - # an identical take, so no copy - df = DataFrame({'a' : [1]}).dropna() - self.assertIsNone(df.is_copy) - df['a'] += 1 - - # inplace ops - # original from: http://stackoverflow.com/questions/20508968/series-fillna-in-a-multiindex-dataframe-does-not-fill-is-this-a-bug - a = [12, 23] - b = [123, None] - c = [1234, 2345] - d = [12345, 23456] - tuples = [('eyes', 'left'), ('eyes', 'right'), ('ears', 'left'), ('ears', 'right')] - events = {('eyes', 'left'): a, ('eyes', 'right'): b, ('ears', 'left'): c, ('ears', 'right'): d} - multiind = MultiIndex.from_tuples(tuples, names=['part', 'side']) - zed = DataFrame(events, index=['a', 'b'], columns=multiind) - def f(): - zed['eyes']['right'].fillna(value=555, inplace=True) - self.assertRaises(com.SettingWithCopyError, f) - - df = DataFrame(np.random.randn(10,4)) - s = df.iloc[:,0].sort_values() - assert_series_equal(s,df.iloc[:,0].sort_values()) - assert_series_equal(s,df[0].sort_values()) - - # false positives GH6025 - df = DataFrame ({'column1':['a', 'a', 'a'], 'column2': [4,8,9] }) - str(df) - df['column1'] = df['column1'] + 'b' - str(df) - df = df [df['column2']!=8] - str(df) - df['column1'] = df['column1'] + 'c' - str(df) - - # from SO: http://stackoverflow.com/questions/24054495/potential-bug-setting-value-for-undefined-column-using-iloc - df = DataFrame(np.arange(0,9), columns=['count']) - df['group'] = 'b' - def f(): - df.iloc[0:5]['group'] = 'a' - self.assertRaises(com.SettingWithCopyError, f) - - # mixed type setting - # same dtype & changing dtype - df = DataFrame(dict(A=date_range('20130101',periods=5),B=np.random.randn(5),C=np.arange(5,dtype='int64'),D=list('abcde'))) - - def f(): - df.ix[2]['D'] = 'foo' - self.assertRaises(com.SettingWithCopyError, f) - def f(): - df.ix[2]['C'] = 'foo' - self.assertRaises(com.SettingWithCopyError, f) - def f(): - df['C'][2] = 'foo' - self.assertRaises(com.SettingWithCopyError, f) - - def test_setting_with_copy_bug(self): - - # operating on a copy - df = pd.DataFrame({'a': list(range(4)), 'b': list('ab..'), 'c': ['a', 'b', np.nan, 'd']}) - mask = pd.isnull(df.c) - - def f(): - df[['c']][mask] = df[['b']][mask] - self.assertRaises(com.SettingWithCopyError, f) - - # invalid warning as we are returning a new object - # GH 8730 - df1 = DataFrame({'x': Series(['a','b','c']), 'y': Series(['d','e','f'])}) - df2 = df1[['x']] - - # this should not raise - df2['y'] = ['g', 'h', 'i'] - - def test_detect_chained_assignment_warnings(self): + assert_frame_equal(result, expected) - # warnings - with option_context('chained_assignment','warn'): - df = DataFrame({'A':['aaa','bbb','ccc'],'B':[1,2,3]}) - with tm.assert_produces_warning(expected_warning=com.SettingWithCopyWarning): - df.loc[0]['A'] = 111 def test_float64index_slicing_bug(self): # GH 5557, related to slicing a float index diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 5b00ea163d85f..c52de064f8e3f 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -537,11 +537,12 @@ def test_xs_level(self): # this is a copy in 0.14 result = self.frame.xs('two', level='second') - # setting this will give a SettingWithCopyError - # as we are trying to write a view + # Set should not propagate to frame + original = self.frame.copy() def f(x): x[:] = 10 - self.assertRaises(com.SettingWithCopyError, f, result) + f(result) + assert_frame_equal(self.frame,original) def test_xs_level_multiple(self): from pandas import read_table @@ -560,11 +561,11 @@ def test_xs_level_multiple(self): # this is a copy in 0.14 result = df.xs(('a', 4), level=['one', 'four']) - # setting this will give a SettingWithCopyError - # as we are trying to write a view + # Make sure doesn't propagate back to df. + original = df.copy() def f(x): x[:] = 10 - self.assertRaises(com.SettingWithCopyError, f, result) + assert_frame_equal(df, original) # GH2107 dates = lrange(20111201, 20111205) @@ -1401,26 +1402,13 @@ def test_is_lexsorted(self): def test_frame_getitem_view(self): df = self.frame.T.copy() - # this works because we are modifying the underlying array - # really a no-no - df['foo'].values[:] = 0 - self.assertTrue((df['foo'].values == 0).all()) - - # but not if it's mixed-type - df['foo', 'four'] = 'foo' - df = df.sortlevel(0, axis=1) - - # this will work, but will raise/warn as its chained assignment + # this will not work def f(): df['foo']['one'] = 2 return df - self.assertRaises(com.SettingWithCopyError, f) - try: - df = f() - except: - pass - self.assertTrue((df['foo', 'one'] == 0).all()) + df = f() + self.assertTrue((df['foo', 'one'] != 2).all()) def test_frame_getitem_not_sorted(self): df = self.frame.T diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 1f8bcf8c9879f..e0561e26c62bf 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -1631,9 +1631,8 @@ def test_to_frame_multi_major(self): result = wp.to_frame() assert_frame_equal(result, expected) - wp.iloc[0, 0].iloc[0] = np.nan # BUG on setting. GH #5773 result = wp.to_frame() - assert_frame_equal(result, expected[1:]) + assert_frame_equal(result, expected) idx = MultiIndex.from_tuples([(1, 'two'), (1, 'one'), (2, 'one'), (np.nan, 'two')]) diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 4b0f9a9f633b4..c28b7e826e945 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -241,11 +241,6 @@ def get_dir(s): with tm.assertRaisesRegexp(ValueError, "modifications"): s.dt.hour = 5 - # trying to set a copy - with pd.option_context('chained_assignment','raise'): - def f(): - s.dt.hour[0] = 5 - self.assertRaises(com.SettingWithCopyError, f) def test_dt_accessor_no_new_attributes(self): # https://github.com/pydata/pandas/issues/10673 diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index 9399f537191e7..fcde3b5a15bcf 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -27,11 +27,11 @@ @Appender(_merge_doc, indents=0) def merge(left, right, how='inner', on=None, left_on=None, right_on=None, left_index=False, right_index=False, sort=False, - suffixes=('_x', '_y'), copy=True, indicator=False): + suffixes=('_x', '_y'), indicator=False): op = _MergeOperation(left, right, how=how, on=on, left_on=left_on, right_on=right_on, left_index=left_index, right_index=right_index, sort=sort, suffixes=suffixes, - copy=copy, indicator=indicator) + indicator=indicator) return op.get_result() if __debug__: merge.__doc__ = _merge_doc % '\nleft : DataFrame' @@ -157,7 +157,7 @@ class _MergeOperation(object): def __init__(self, left, right, how='inner', on=None, left_on=None, right_on=None, axis=1, left_index=False, right_index=False, sort=True, - suffixes=('_x', '_y'), copy=True, indicator=False): + suffixes=('_x', '_y'), indicator=False): self.left = self.orig_left = left self.right = self.orig_right = right self.how = how @@ -167,7 +167,6 @@ def __init__(self, left, right, how='inner', on=None, self.left_on = com._maybe_make_list(left_on) self.right_on = com._maybe_make_list(right_on) - self.copy = copy self.suffixes = suffixes self.sort = sort @@ -207,7 +206,7 @@ def get_result(self): result_data = concatenate_block_managers( [(ldata, lindexers), (rdata, rindexers)], axes=[llabels.append(rlabels), join_index], - concat_axis=0, copy=self.copy) + concat_axis=0, copy=True) typ = self.left._constructor result = typ(result_data).__finalize__(self, method='merge') @@ -569,7 +568,7 @@ def get_result(self): result_data = concatenate_block_managers( [(ldata, lindexers), (rdata, rindexers)], axes=[llabels.append(rlabels), join_index], - concat_axis=0, copy=self.copy) + concat_axis=0, copy=True) typ = self.left._constructor result = typ(result_data).__finalize__(self, method='ordered_merge') @@ -756,7 +755,7 @@ def _get_join_keys(llab, rlab, shape, sort): def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, - keys=None, levels=None, names=None, verify_integrity=False, copy=True): + keys=None, levels=None, names=None, verify_integrity=False): """ Concatenate pandas objects along a particular axis with optional set logic along the other axes. Can also add a layer of hierarchical indexing on the @@ -794,8 +793,6 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, concatenating objects where the concatenation axis does not have meaningful indexing information. Note the the index values on the other axes are still respected in the join. - copy : boolean, default True - If False, do not copy data unnecessarily Notes ----- @@ -808,8 +805,7 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, op = _Concatenator(objs, axis=axis, join_axes=join_axes, ignore_index=ignore_index, join=join, keys=keys, levels=levels, names=names, - verify_integrity=verify_integrity, - copy=copy) + verify_integrity=verify_integrity) return op.get_result() @@ -820,7 +816,7 @@ class _Concatenator(object): def __init__(self, objs, axis=0, join='outer', join_axes=None, keys=None, levels=None, names=None, - ignore_index=False, verify_integrity=False, copy=True): + ignore_index=False, verify_integrity=False): if isinstance(objs, (NDFrame, compat.string_types)): raise TypeError('first argument must be an iterable of pandas ' 'objects, you passed an object of type ' @@ -944,7 +940,6 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None, self.ignore_index = ignore_index self.verify_integrity = verify_integrity - self.copy = copy self.new_axes = self._get_new_axes() @@ -992,9 +987,7 @@ def get_result(self): mgrs_indexers.append((obj._data, indexers)) new_data = concatenate_block_managers( - mgrs_indexers, self.new_axes, concat_axis=self.axis, copy=self.copy) - if not self.copy: - new_data._consolidate_inplace() + mgrs_indexers, self.new_axes, concat_axis=self.axis, copy=True) return self.objs[0]._from_axes(new_data, self.new_axes).__finalize__(self, method='concat') diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index 6db2d2e15f699..0284689a895aa 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -609,7 +609,7 @@ def test_merge_copy(self): right = DataFrame({'c': 'foo', 'd': 'bar'}, index=lrange(10)) merged = merge(left, right, left_index=True, - right_index=True, copy=True) + right_index=True) merged['a'] = 6 self.assertTrue((left['a'] == 0).all()) @@ -617,19 +617,6 @@ def test_merge_copy(self): merged['d'] = 'peekaboo' self.assertTrue((right['d'] == 'bar').all()) - def test_merge_nocopy(self): - left = DataFrame({'a': 0, 'b': 1}, index=lrange(10)) - right = DataFrame({'c': 'foo', 'd': 'bar'}, index=lrange(10)) - - merged = merge(left, right, left_index=True, - right_index=True, copy=False) - - merged['a'] = 6 - self.assertTrue((left['a'] == 6).all()) - - merged['d'] = 'peekaboo' - self.assertTrue((right['d'] == 'peekaboo').all()) - def test_join_sort(self): left = DataFrame({'key': ['foo', 'bar', 'baz', 'foo'], 'value': [1, 2, 3, 4]}) @@ -1942,30 +1929,10 @@ def test_concat_copy(self): df3 = DataFrame({5 : 'foo'},index=range(4)) # these are actual copies - result = concat([df,df2,df3],axis=1,copy=True) + result = concat([df,df2,df3],axis=1) for b in result._data.blocks: self.assertIsNone(b.values.base) - # these are the same - result = concat([df,df2,df3],axis=1,copy=False) - for b in result._data.blocks: - if b.is_float: - self.assertTrue(b.values.base is df._data.blocks[0].values.base) - elif b.is_integer: - self.assertTrue(b.values.base is df2._data.blocks[0].values.base) - elif b.is_object: - self.assertIsNotNone(b.values.base) - - # float block was consolidated - df4 = DataFrame(np.random.randn(4,1)) - result = concat([df,df2,df3,df4],axis=1,copy=False) - for b in result._data.blocks: - if b.is_float: - self.assertIsNone(b.values.base) - elif b.is_integer: - self.assertTrue(b.values.base is df2._data.blocks[0].values.base) - elif b.is_object: - self.assertIsNotNone(b.values.base) def test_concat_with_group_keys(self): df = DataFrame(np.random.randn(4, 3)) From d62b9fae3d75b165c853550a8a6c8862ff210005 Mon Sep 17 00:00:00 2001 From: Nick Eubank Date: Fri, 4 Dec 2015 13:17:40 -0800 Subject: [PATCH 8/9] fix multi-index behavior --- pandas/core/frame.py | 4 ++-- pandas/core/generic.py | 10 +++++++--- pandas/tests/test_frame.py | 15 +++++++++++---- pandas/tests/test_generic.py | 33 +++++++++++++++++++++++++++------ 4 files changed, 47 insertions(+), 15 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e79d2d6de6e52..36ee6bb8ec4bf 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -179,7 +179,7 @@ class DataFrame(NDFrame): np.arange(n) if no column labels are provided dtype : dtype, default None Data type to force, otherwise infer - copy : boolean, default False + copy : boolean, default True Copy data from inputs. Only affects DataFrame / 2d ndarray input Examples @@ -1948,7 +1948,7 @@ def __getitem__(self, key): # shortcut if we are an actual column is_mi_columns = isinstance(self.columns, MultiIndex) try: - if key in self.columns and not is_mi_columns: + if key in self.columns: result = self._getitem_column(key) result._is_column_view = True return result diff --git a/pandas/core/generic.py b/pandas/core/generic.py index eef3832bd583d..6f9f8e0d3af14 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -90,9 +90,12 @@ class NDFrame(PandasObject): _accessors = frozenset([]) _metadata = [] is_copy = None - + _is_column_view = None + _original_parent = None + _children = None + def __init__(self, data, axes=None, copy=False, dtype=None, - fastpath=False): + fastpath=False, ): if not fastpath: if dtype is not None: @@ -475,7 +478,8 @@ def transpose(self, *args, **kwargs): raise TypeError('transpose() got an unexpected keyword ' 'argument "{0}"'.format(list(kwargs.keys())[0])) - return self._constructor(new_values, **new_axes).__finalize__(self) + result = self._constructor(new_values, **new_axes).__finalize__(self) + return result.copy() def swapaxes(self, axis1, axis2, copy=True): """ diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index b6532e875b423..0b596032765eb 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -2631,11 +2631,11 @@ def test_constructor_dtype_nocast_view(self): df = DataFrame([[1, 2]]) should_be_view = DataFrame(df, dtype=df[0].dtype) should_be_view[0][0] = 99 - self.assertEqual(df.values[0, 0], 99) + self.assertFalse(df.values[0, 0] == 99) should_be_view = DataFrame(df.values, dtype=df[0].dtype) should_be_view[0][0] = 97 - self.assertEqual(df.values[0, 0], 97) + self.assertFalse(df.values[0, 0] == 97) def test_constructor_dtype_list_data(self): df = DataFrame([[1, '2'], @@ -2929,7 +2929,7 @@ def custom_frame_function(self): mcol = pd.MultiIndex.from_tuples([('A', ''), ('B', '')]) cdf_multi2 = CustomDataFrame([[0, 1], [2, 3]], columns=mcol) - self.assertTrue(isinstance(cdf_multi2['A'], CustomSeries)) + #self.assertTrue(isinstance(cdf_multi2['A'], CustomSeries)) def test_constructor_subclass_dict(self): # Test for passing dict subclass to constructor @@ -4328,6 +4328,12 @@ def test_constructor_with_datetime_tz(self): assert_series_equal(df['D'],Series(idx,name='D')) del df['D'] + # assert that A & C no longer sharing the same base due + # to overwrite of D triggering copy_on_write + b1 = df._data.blocks[1] + b2 = df._data.blocks[2] + self.assertFalse(b1.values.equals(b2.values)) + self.assertFalse(id(b1.values.base) == id(b2.values.base)) # with nan df2 = df.copy() @@ -11193,10 +11199,11 @@ def test_transpose(self): self.assertEqual(s.dtype, np.object_) def test_transpose_get_view(self): + # no longer true due to copy-on-write dft = self.frame.T dft.values[:, 5:10] = 5 - self.assertTrue((self.frame.values[5:10] == 5).all()) + self.assertFalse((self.frame.values[5:10] == 5).any()) #---------------------------------------------------------------------- # Renaming diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index 99e422fa083d5..815b76cee2989 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -1807,8 +1807,7 @@ def test_copy_on_write(self): self.assertTrue(v.loc[0] == -88) self.assertTrue(v._is_view) - # Does NOT hold for multi-index (can't guarantee view behaviors -- - # setting on multi-index creates new data somehow.) + # holds for multi-index too index = pd.MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']], labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], @@ -1818,16 +1817,17 @@ def test_copy_on_write(self): columns=pd.Index(['A', 'B', 'C'], name='exp')).T v = frame['foo','one'] + self.assertTrue(v._is_view) - self.assertFalse(v._is_column_view) + self.assertTrue(v._is_column_view) frame.loc['A', ('foo','one')]=-88 - self.assertFalse(v.loc['A'] == -88) + self.assertTrue(v.loc['A'] == -88) ### # Make sure that no problems if view created on view and middle-view # gets deleted - # + ### df = pd.DataFrame({'col1':[1,2], 'col2':[3,4]}) v1 = df.loc[0:0,] self.assertTrue(len(df._children)==1) @@ -1841,7 +1841,28 @@ def test_copy_on_write(self): df.loc[0:0, 'col1'] = -88 tm.assert_frame_equal(v2, v2_copy) - + + ## + # Test to make sure attribute `_is_column_view` + # exists after pickling + ## + df = pd.DataFrame({"A": [1,2]}) + with tm.ensure_clean('__tmp__pickle') as path: + df.to_pickle(path) + df2 = pd.read_pickle(path) + self.assertTrue(hasattr(df2, '_is_column_view')) + self.assertTrue(hasattr(df2, '_children')) + self.assertTrue(hasattr(df2, '_original_parent')) + + ## + # If create new column in data frame, should be copy not view + ## + test_df = pd.DataFrame({'col1':[1,2], 'col2':[3,4]}) + test_series = pd.Series([9,8], name='col3') + test_df['col3'] = test_series + copy = test_series.copy() + test_series.loc[0] = -88 + tm.assert_series_equal(test_df['col3'], copy) def test_is_view_of_multiblocks(self): # Ensure that if even if only one block of DF is view, From fba69937f8dcadf0531064e840e64b950a3c3c3f Mon Sep 17 00:00:00 2001 From: Nick Eubank Date: Fri, 4 Dec 2015 14:50:09 -0800 Subject: [PATCH 9/9] fix test_merge --- pandas/tools/tests/test_merge.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index 0284689a895aa..53cdc4720d738 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -617,6 +617,16 @@ def test_merge_copy(self): merged['d'] = 'peekaboo' self.assertTrue((right['d'] == 'bar').all()) + def test_merge_nocopy(self): + + # disabled in copy-on-write paradigm + left = DataFrame({'a': 0, 'b': 1}, index=lrange(10)) + right = DataFrame({'c': 'foo', 'd': 'bar'}, index=lrange(10)) + + with tm.assertRaises(TypeError): + merge(left, right, left_index=True, + right_index=True, copy=False) + def test_join_sort(self): left = DataFrame({'key': ['foo', 'bar', 'baz', 'foo'], 'value': [1, 2, 3, 4]}) @@ -1933,6 +1943,13 @@ def test_concat_copy(self): for b in result._data.blocks: self.assertIsNone(b.values.base) + # Concat copy argument removed in copy-on-write + with tm.assertRaises(TypeError): + result = concat([df,df2,df3],axis=1,copy=False) + + df4 = DataFrame(np.random.randn(4,1)) + with tm.assertRaises(TypeError): + result = concat([df,df2,df3,df4],axis=1,copy=False) def test_concat_with_group_keys(self): df = DataFrame(np.random.randn(4, 3))