Skip to content

Copy on Write via weakrefs #11207

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 3 additions & 10 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1894,7 +1894,9 @@ def __getitem__(self, key):
is_mi_columns = isinstance(self.columns, MultiIndex)
try:
if key in self.columns and not is_mi_columns:
return self._getitem_column(key)
result = self._getitem_column(key)
result._is_column_view = True
return result
except:
pass

Expand Down Expand Up @@ -2244,7 +2246,6 @@ def __setitem__(self, key, value):
self._set_item(key, value)

def _setitem_slice(self, key, value):
self._check_setitem_copy()
self.ix._setitem_with_indexer(key, value)

def _setitem_array(self, key, value):
Expand All @@ -2255,7 +2256,6 @@ def _setitem_array(self, key, value):
(len(key), len(self.index)))
key = check_bool_indexer(self.index, key)
indexer = key.nonzero()[0]
self._check_setitem_copy()
self.ix._setitem_with_indexer(indexer, value)
else:
if isinstance(value, DataFrame):
Expand All @@ -2265,7 +2265,6 @@ def _setitem_array(self, key, value):
self[k1] = value[k2]
else:
indexer = self.ix._convert_to_indexer(key, axis=1)
self._check_setitem_copy()
self.ix._setitem_with_indexer((slice(None), indexer), value)

def _setitem_frame(self, key, value):
Expand All @@ -2275,7 +2274,6 @@ def _setitem_frame(self, key, value):
raise TypeError('Must pass DataFrame with boolean values only')

self._check_inplace_setting(value)
self._check_setitem_copy()
self.where(-key, value, inplace=True)

def _ensure_valid_index(self, value):
Expand Down Expand Up @@ -2311,11 +2309,6 @@ def _set_item(self, key, value):
value = self._sanitize_column(key, value)
NDFrame._set_item(self, key, value)

# check if we are modifying a copy
# try to set first as we want an invalid
# value exeption to occur first
if len(self):
self._check_setitem_copy()

def insert(self, loc, column, value, allow_duplicates=False):
"""
Expand Down
134 changes: 34 additions & 100 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,8 @@ class NDFrame(PandasObject):
_internal_names = ['_data', '_cacher', '_item_cache', '_cache',
'is_copy', '_subtyp', '_index',
'_default_kind', '_default_fill_value', '_metadata',
'__array_struct__', '__array_interface__']
'__array_struct__', '__array_interface__', '_children',
'_is_column_view']
_internal_names_set = set(_internal_names)
_accessors = frozenset([])
_metadata = []
Expand All @@ -105,6 +106,9 @@ def __init__(self, data, axes=None, copy=False, dtype=None,
object.__setattr__(self, 'is_copy', None)
object.__setattr__(self, '_data', data)
object.__setattr__(self, '_item_cache', {})
object.__setattr__(self, '_children', [])
object.__setattr__(self, '_is_column_view', False)


def _validate_dtype(self, dtype):
""" validate the passed dtype """
Expand Down Expand Up @@ -1074,13 +1078,19 @@ def get(self, key, default=None):
-------
value : type of items contained in object
"""

try:
return self[key]
except (KeyError, ValueError, IndexError):
return default

def __getitem__(self, item):
return self._get_item_cache(item)
result = self._get_item_cache(item)

if isinstance(item, str):
result._is_column_view = True

return result

def _get_item_cache(self, item):
""" return the cached item, item represents a label indexer """
Expand Down Expand Up @@ -1174,9 +1184,6 @@ def _maybe_update_cacher(self, clear=False, verify_is_copy=True):
except:
pass

if verify_is_copy:
self._check_setitem_copy(stacklevel=5, t='referant')

if clear:
self._clear_item_cache()

Expand All @@ -1201,9 +1208,16 @@ def _slice(self, slobj, axis=0, kind=None):
# but only in a single-dtyped view slicable case
is_copy = axis!=0 or result._is_view
result._set_is_copy(self, copy=is_copy)

self._children.append(weakref.ref(result))

return result

def _set_item(self, key, value):

# If children are views, reset to copies before setting.
self._convert_views_to_copies()

self._data.set(key, value)
self._clear_item_cache()

Expand All @@ -1216,103 +1230,21 @@ def _set_is_copy(self, ref=None, copy=True):
else:
self.is_copy = None

def _check_is_chained_assignment_possible(self):
"""
check if we are a view, have a cacher, and are of mixed type
if so, then force a setitem_copy check

should be called just near setting a value

will return a boolean if it we are a view and are cached, but a single-dtype
meaning that the cacher should be updated following setting
"""
if self._is_view and self._is_cached:
ref = self._get_cacher()
if ref is not None and ref._is_mixed_type:
self._check_setitem_copy(stacklevel=4, t='referant', force=True)
return True
elif self.is_copy:
self._check_setitem_copy(stacklevel=4, t='referant')
return False

def _check_setitem_copy(self, stacklevel=4, t='setting', force=False):
"""

Parameters
----------
stacklevel : integer, default 4
the level to show of the stack when the error is output
t : string, the type of setting error
force : boolean, default False
if True, then force showing an error

validate if we are doing a settitem on a chained copy.

If you call this function, be sure to set the stacklevel such that the
user will see the error *at the level of setting*
def _convert_views_to_copies(self):
# Don't set on views.
if self._is_view and not self._is_column_view:
self._data = self._data.copy()

It is technically possible to figure out that we are setting on
a copy even WITH a multi-dtyped pandas object. In other words, some blocks
may be views while other are not. Currently _is_view will ALWAYS return False
for multi-blocks to avoid having to handle this case.
# Before setting values, make sure children converted to copies.
for child in self._children:

df = DataFrame(np.arange(0,9), columns=['count'])
df['group'] = 'b'

# this technically need not raise SettingWithCopy if both are view (which is not
# generally guaranteed but is usually True
# however, this is in general not a good practice and we recommend using .loc
df.iloc[0:5]['group'] = 'a'

"""

if force or self.is_copy:

value = config.get_option('mode.chained_assignment')
if value is None:
return

# see if the copy is not actually refererd; if so, then disolve
# the copy weakref
try:
gc.collect(2)
if not gc.get_referents(self.is_copy()):
self.is_copy = None
return
except:
pass

# we might be a false positive
try:
if self.is_copy().shape == self.shape:
self.is_copy = None
return
except:
pass

# a custom message
if isinstance(self.is_copy, string_types):
t = self.is_copy

elif t == 'referant':
t = ("\n"
"A value is trying to be set on a copy of a slice from a "
"DataFrame\n\n"
"See the caveats in the documentation: "
"http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy")

else:
t = ("\n"
"A value is trying to be set on a copy of a slice from a "
"DataFrame.\n"
"Try using .loc[row_indexer,col_indexer] = value instead\n\n"
"See the caveats in the documentation: "
"http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy")

if value == 'raise':
raise SettingWithCopyError(t)
elif value == 'warn':
warnings.warn(t, SettingWithCopyWarning, stacklevel=stacklevel)
# Make sure children of children converted.
child()._convert_views_to_copies()

if child()._is_view and not self._is_column_view:
child()._data = child()._data.copy()

self._children=[]

def __delitem__(self, key):
"""
Expand Down Expand Up @@ -2229,6 +2161,7 @@ def __finalize__(self, other, method=None, **kwargs):
return self

def __getattr__(self, name):

"""After regular attribute access, try looking up the name
This allows simpler access to columns for interactive use.
"""
Expand All @@ -2252,6 +2185,7 @@ def __setattr__(self, name, value):
# e.g. ``obj.x`` and ``obj.x = 4`` will always reference/modify
# the same attribute.


try:
object.__getattribute__(self, name)
return object.__setattr__(self, name, value)
Expand Down
7 changes: 5 additions & 2 deletions pandas/core/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ def __iter__(self):
raise NotImplementedError('ix is not iterable')

def __getitem__(self, key):

if type(key) is tuple:
try:
values = self.obj.get_value(*key)
Expand Down Expand Up @@ -111,6 +112,9 @@ def _get_setitem_indexer(self, key):
raise IndexingError(key)

def __setitem__(self, key, value):
# Make sure changes don't propagate to children
self.obj._convert_views_to_copies()

indexer = self._get_setitem_indexer(key)
self._setitem_with_indexer(indexer, value)

Expand Down Expand Up @@ -199,6 +203,7 @@ def _has_valid_positional_setitem_indexer(self, indexer):
def _setitem_with_indexer(self, indexer, value):
self._has_valid_setitem_indexer(indexer)


# also has the side effect of consolidating in-place
from pandas import Panel, DataFrame, Series
info_axis = self.obj._info_axis_number
Expand Down Expand Up @@ -508,8 +513,6 @@ def can_do_equal_len():
if isinstance(value, ABCPanel):
value = self._align_panel(indexer, value)

# check for chained assignment
self.obj._check_is_chained_assignment_possible()

# actually do the set
self.obj._consolidate_inplace()
Expand Down
3 changes: 0 additions & 3 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -714,10 +714,7 @@ def setitem(key, value):
self._set_with(key, value)

# do the setitem
cacher_needs_updating = self._check_is_chained_assignment_possible()
setitem(key, value)
if cacher_needs_updating:
self._maybe_update_cacher()

def _set_with_engine(self, key, value):
values = self._values
Expand Down
Loading