Skip to content

Fix for #3970 cache invalidation bug #4077

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 18 additions & 5 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2100,7 +2100,15 @@ def _slice(self, slobj, axis=0, raise_on_error=False):
else:
mgr_axis = 0

new_data = self._data.get_slice(slobj, axis=mgr_axis, raise_on_error=raise_on_error)
# Super bad smell, need to review all this cache inval / block business
blocks_before = len(self._data.blocks)
new_data = self._data.get_slice(slobj, axis=mgr_axis,
raise_on_error=raise_on_error)

# Internal consolidation requires cache invalidation
if len(self._data.blocks) != blocks_before:
self._clear_item_cache()

return self._constructor(new_data)

def _box_item_values(self, key, values):
Expand Down Expand Up @@ -3226,7 +3234,8 @@ def sort(self, columns=None, column=None, axis=0, ascending=True,
return self.sort_index(by=columns, axis=axis, ascending=ascending,
inplace=inplace)

def sort_index(self, axis=0, by=None, ascending=True, inplace=False):
def sort_index(self, axis=0, by=None, ascending=True, inplace=False,
kind='quicksort'):
"""
Sort DataFrame either by labels (along either axis) or by the values in
a column
Expand Down Expand Up @@ -3263,7 +3272,10 @@ def sort_index(self, axis=0, by=None, ascending=True, inplace=False):
if by is not None:
if axis != 0:
raise AssertionError('Axis must be 0')
if isinstance(by, (tuple, list)):
if not isinstance(by, (tuple, list)):
by = [by]

if len(by) > 1:
keys = []
for x in by:
k = self[x].values
Expand All @@ -3281,18 +3293,19 @@ def trans(v):
indexer = _lexsort_indexer(keys, orders=ascending)
indexer = com._ensure_platform_int(indexer)
else:
by = by[0]
k = self[by].values
if k.ndim == 2:
raise ValueError('Cannot sort by duplicate column %s'
% str(by))
indexer = k.argsort()
indexer = k.argsort(kind=kind)
if not ascending:
indexer = indexer[::-1]
elif isinstance(labels, MultiIndex):
indexer = _lexsort_indexer(labels.labels, orders=ascending)
indexer = com._ensure_platform_int(indexer)
else:
indexer = labels.argsort()
indexer = labels.argsort(kind=kind)
if not ascending:
indexer = indexer[::-1]

Expand Down
13 changes: 11 additions & 2 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -775,11 +775,20 @@ def consolidate(self, inplace=False):

@property
def _is_mixed_type(self):
return self._data.is_mixed_type
f = lambda: self._data.is_mixed_type
return self._protect_consolidate(f)

@property
def _is_numeric_mixed_type(self):
return self._data.is_numeric_mixed_type
f = lambda: self._data.is_numeric_mixed_type
return self._protect_consolidate(f)

def _protect_consolidate(self, f):
blocks_before = len(self._data.blocks)
result = f()
if len(self._data.blocks) != blocks_before:
self._clear_item_cache()
return result

def _reindex_axis(self, new_index, fill_method, axis, copy):
new_data = self._data.reindex_axis(new_index, axis=axis,
Expand Down
47 changes: 24 additions & 23 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ def reindex_items_from(self, new_ref_items, copy=True):
reindexed : Block
"""
new_ref_items, indexer = self.items.reindex(new_ref_items)

if indexer is None:
new_items = new_ref_items
new_values = self.values.copy() if copy else self.values
Expand Down Expand Up @@ -1074,7 +1074,7 @@ def _set_ref_locs(self, labels=None, do_refs=False):
if we have a non-unique index on this axis, set the indexers
we need to set an absolute indexer for the blocks
return the indexer if we are not unique

labels : the (new) labels for this manager
ref : boolean, whether to set the labels (one a 1-1 mapping)

Expand All @@ -1088,7 +1088,7 @@ def _set_ref_locs(self, labels=None, do_refs=False):
if is_unique and not do_refs:

if not self.items.is_unique:

# reset our ref locs
self._ref_locs = None
for b in self.blocks:
Expand Down Expand Up @@ -1130,12 +1130,12 @@ def _set_ref_locs(self, labels=None, do_refs=False):
self._ref_locs = rl
return rl

# return our cached _ref_locs (or will compute again
# return our cached _ref_locs (or will compute again
# when we recreate the block manager if needed
return getattr(self,'_ref_locs',None)

def get_items_map(self, use_cached=True):
"""
"""
return an inverted ref_loc map for an item index
block -> item (in that block) location -> column location

Expand Down Expand Up @@ -1166,7 +1166,7 @@ def get_items_map(self, use_cached=True):
else:

for i, (block, idx) in enumerate(rl):

m = maybe_create_block_in_items_map(im,block)
m[idx] = i

Expand Down Expand Up @@ -1362,11 +1362,13 @@ def _consolidate_check(self):

@property
def is_mixed_type(self):
    """True when, after in-place consolidation, more than one block remains.

    Warning: the consolidation performed here must be accounted for by
    callers upstairs (their caches may need invalidation).
    """
    self._consolidate_inplace()
    block_count = len(self.blocks)
    return block_count > 1

@property
def is_numeric_mixed_type(self):
    """True when every block (after in-place consolidation) is numeric.

    Warning: the consolidation performed here must be accounted for by
    callers upstairs (their caches may need invalidation).
    """
    self._consolidate_inplace()
    return all(blk.is_numeric for blk in self.blocks)

Expand Down Expand Up @@ -1438,9 +1440,9 @@ def get_slice(self, slobj, axis=0, raise_on_error=False):
new_items = new_axes[0]
if len(self.blocks) == 1:
blk = self.blocks[0]
newb = make_block(blk.values[slobj],
newb = make_block(blk.values[slobj],
new_items,
new_items,
new_items,
klass=blk.__class__,
fastpath=True,
placement=blk._ref_locs)
Expand All @@ -1462,9 +1464,9 @@ def _slice_blocks(self, slobj, axis):
slicer = tuple(slicer)

for block in self.blocks:
newb = make_block(block.values[slicer],
newb = make_block(block.values[slicer],
block.items,
block.ref_items,
block.ref_items,
klass=block.__class__,
fastpath=True,
placement=block._ref_locs)
Expand Down Expand Up @@ -1576,9 +1578,9 @@ def xs(self, key, axis=1, copy=True):
raise Exception('cannot get view of mixed-type or '
'non-consolidated DataFrame')
for blk in self.blocks:
newb = make_block(blk.values[slicer],
blk.items,
blk.ref_items,
newb = make_block(blk.values[slicer],
blk.items,
blk.ref_items,
klass=blk.__class__,
fastpath=True)
new_blocks.append(newb)
Expand All @@ -1587,8 +1589,8 @@ def xs(self, key, axis=1, copy=True):
vals = block.values[slicer]
if copy:
vals = vals.copy()
new_blocks = [make_block(vals,
self.items,
new_blocks = [make_block(vals,
self.items,
self.items,
klass=block.__class__,
fastpath=True)]
Expand Down Expand Up @@ -1637,7 +1639,6 @@ def consolidate(self):

def _consolidate_inplace(self):
if not self.is_consolidated():

self.blocks = _consolidate(self.blocks, self.items)

# reset our mappings
Expand Down Expand Up @@ -1703,7 +1704,7 @@ def delete(self, item):
# dupe keys may return mask
loc = _possibly_convert_to_indexer(loc)
self._delete_from_all_blocks(loc, item)

# _ref_locs, and _items_map are good here
new_items = self.items.delete(loc)
self.set_items_norename(new_items)
Expand Down Expand Up @@ -1763,7 +1764,7 @@ def _set_item(item, arr):
if self.items.is_unique:
self._reset_ref_locs()
self._set_ref_locs(do_refs='force')

self._rebuild_ref_locs()


Expand Down Expand Up @@ -1893,7 +1894,7 @@ def _delete_from_block(self, i, item):

# reset the ref_locs based on the now good block._ref_locs
self._reset_ref_locs()

def _add_new_block(self, item, value, loc=None):
# Do we care about dtype at the moment?

Expand All @@ -1919,7 +1920,7 @@ def _add_new_block(self, item, value, loc=None):
self._ref_locs[i] = self._ref_locs[i-1]

self._ref_locs[loc] = (new_block, 0)

# and reset
self._reset_ref_locs()
self._set_ref_locs(do_refs=True)
Expand Down Expand Up @@ -2081,7 +2082,7 @@ def take(self, indexer, new_index=None, axis=1, verify=True):
if new_index is None:
new_index = self.axes[axis].take(indexer)

new_axes[axis] = new_index
new_axes[axis] = new_index
return self.apply('take',axes=new_axes,indexer=indexer,ref_items=new_axes[0],axis=axis)

def merge(self, other, lsuffix=None, rsuffix=None):
Expand Down Expand Up @@ -2453,7 +2454,7 @@ def _lcd_dtype(l):
if lcd.kind == 'u':
return np.dtype('int%s' % (lcd.itemsize*8*2))
return lcd

elif have_dt64 and not have_float and not have_complex:
return np.dtype('M8[ns]')
elif have_complex:
Expand Down Expand Up @@ -2500,7 +2501,7 @@ def _merge_blocks(blocks, items, dtype=None):
new_ref_locs = [ b._ref_locs for b in blocks ]
if all([ x is not None for x in new_ref_locs ]):
new_block.set_ref_locs(np.concatenate(new_ref_locs))
return new_block
return new_block


def _block_shape(values, ndim=1, shape=None):
Expand Down
33 changes: 28 additions & 5 deletions pandas/tests/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,13 @@ def _skip_if_no_scipy():
except ImportError:
raise nose.SkipTest

#-------------------------------------------------------------------------------
#---------------------------------------------------------------------
# DataFrame test cases

JOIN_TYPES = ['inner', 'outer', 'left', 'right']
MIXED_FLOAT_DTYPES = ['float16','float32','float64']
MIXED_INT_DTYPES = ['uint8','uint16','uint32','uint64','int8','int16','int32','int64']
MIXED_INT_DTYPES = ['uint8','uint16','uint32','uint64','int8','int16',
'int32','int64']

def _check_mixed_float(df, dtype = None):

Expand Down Expand Up @@ -3188,7 +3189,8 @@ def test_operators_timedelta64(self):
result = mixed.min(axis=1)

# GH 3106
df = DataFrame({ 'time' : date_range('20130102',periods=5), 'time2' : date_range('20130105',periods=5) })
df = DataFrame({'time' : date_range('20130102',periods=5),
'time2' : date_range('20130105',periods=5) })
df['off1'] = df['time2']-df['time']
self.assert_(df['off1'].dtype == 'timedelta64[ns]')

Expand All @@ -3197,6 +3199,24 @@ def test_operators_timedelta64(self):
self.assertTrue(df['off1'].dtype == 'timedelta64[ns]')
self.assertTrue(df['off2'].dtype == 'timedelta64[ns]')

def test__slice_consolidate_invalidate_item_cache(self):
    """Regression test for GH #3970: a stale item cache after consolidation.

    Scenario: a cached Series reference survives an internal block
    consolidation, so chained assignment through it writes to the wrong
    (pre-consolidation) block.
    """
    # #3970
    # Two columns of different dtypes -> two blocks in the BlockManager.
    df = DataFrame({ "aa":range(5), "bb":[2.2]*5})

    # Creates a second float block
    df["cc"] = 0.0

    # caches a reference to the 'bb' series
    df["bb"]

    # repr machinery triggers consolidation
    repr(df)

    # Assignment to wrong series
    # Without the fix, this chained assignment lands in the stale,
    # pre-consolidation block and is invisible through df['bb'].
    df['bb'].iloc[0] = 0.17
    # Drop the item cache so the read below goes back to the (now
    # consolidated) blocks rather than the cached Series.
    df._clear_item_cache()
    self.assertAlmostEqual(df['bb'][0], 0.17)

def test_new_empty_index(self):
df1 = DataFrame(randn(0, 3))
df2 = DataFrame(randn(0, 3))
Expand Down Expand Up @@ -7514,7 +7534,7 @@ def _safe_add(df):
def is_ok(s):
return issubclass(s.dtype.type, (np.integer,np.floating)) and s.dtype != 'uint8'
return DataFrame(dict([ (c,s+1) if is_ok(s) else (c,s) for c, s in df.iteritems() ]))

def _check_get(df, cond, check_dtypes = True):
other1 = _safe_add(df)
rs = df.where(cond, other1)
Expand Down Expand Up @@ -8483,7 +8503,10 @@ def test_sort_datetimes(self):
df = DataFrame(['a','a','a','b','c','d','e','f','g'],
columns=['A'],
index=date_range('20130101',periods=9))
dts = [ Timestamp(x) for x in ['2004-02-11','2004-01-21','2004-01-26','2005-09-20','2010-10-04','2009-05-12','2008-11-12','2010-09-28','2010-09-28'] ]
dts = [Timestamp(x)
for x in ['2004-02-11','2004-01-21','2004-01-26',
'2005-09-20','2010-10-04','2009-05-12',
'2008-11-12','2010-09-28','2010-09-28']]
df['B'] = dts[::2] + dts[1::2]
df['C'] = 2.
df['A1'] = 3.
Expand Down