From 2041d721baa5d928be0ffcb057b9f377c5b6637c Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 28 Jun 2013 10:55:40 -0700 Subject: [PATCH] BUG: fix item cache invalidation bug in #3970 caused by is_mixed_type silently consolidating (hurf). also fix stable sorting bug presenting on my machine --- pandas/core/frame.py | 23 +++++++++++++++---- pandas/core/generic.py | 13 +++++++++-- pandas/core/internals.py | 47 +++++++++++++++++++------------------- pandas/tests/test_frame.py | 33 ++++++++++++++++++++++---- 4 files changed, 81 insertions(+), 35 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 97ceddb73a10d..9b1f144eeecbf 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2100,7 +2100,15 @@ def _slice(self, slobj, axis=0, raise_on_error=False): else: mgr_axis = 0 - new_data = self._data.get_slice(slobj, axis=mgr_axis, raise_on_error=raise_on_error) + # Super bad smell, need to review all this cache inval / block business + blocks_before = len(self._data.blocks) + new_data = self._data.get_slice(slobj, axis=mgr_axis, + raise_on_error=raise_on_error) + + # Internal consolidation requires cache invalidation + if len(self._data.blocks) != blocks_before: + self._clear_item_cache() + return self._constructor(new_data) def _box_item_values(self, key, values): @@ -3226,7 +3234,8 @@ def sort(self, columns=None, column=None, axis=0, ascending=True, return self.sort_index(by=columns, axis=axis, ascending=ascending, inplace=inplace) - def sort_index(self, axis=0, by=None, ascending=True, inplace=False): + def sort_index(self, axis=0, by=None, ascending=True, inplace=False, + kind='quicksort'): """ Sort DataFrame either by labels (along either axis) or by the values in a column @@ -3263,7 +3272,10 @@ def sort_index(self, axis=0, by=None, ascending=True, inplace=False): if by is not None: if axis != 0: raise AssertionError('Axis must be 0') - if isinstance(by, (tuple, list)): + if not isinstance(by, (tuple, list)): + by = [by] + + if len(by) > 1: keys = [] for x in by: k = self[x].values @@ -3281,18 +3293,19 @@ def trans(v): indexer = _lexsort_indexer(keys, orders=ascending) indexer = com._ensure_platform_int(indexer) else: + by = by[0] k = self[by].values if k.ndim == 2: raise ValueError('Cannot sort by duplicate column %s' % str(by)) - indexer = k.argsort() + indexer = k.argsort(kind=kind) if not ascending: indexer = indexer[::-1] elif isinstance(labels, MultiIndex): indexer = _lexsort_indexer(labels.labels, orders=ascending) indexer = com._ensure_platform_int(indexer) else: - indexer = labels.argsort() + indexer = labels.argsort(kind=kind) if not ascending: indexer = indexer[::-1] diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 627a8ab825e5f..b90368ab7c4ef 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -775,11 +775,20 @@ def consolidate(self, inplace=False): @property def _is_mixed_type(self): - return self._data.is_mixed_type + f = lambda: self._data.is_mixed_type + return self._protect_consolidate(f) @property def _is_numeric_mixed_type(self): - return self._data.is_numeric_mixed_type + f = lambda: self._data.is_numeric_mixed_type + return self._protect_consolidate(f) + + def _protect_consolidate(self, f): + blocks_before = len(self._data.blocks) + result = f() + if len(self._data.blocks) != blocks_before: + self._clear_item_cache() + return result def _reindex_axis(self, new_index, fill_method, axis, copy): new_data = self._data.reindex_axis(new_index, axis=axis, diff --git a/pandas/core/internals.py b/pandas/core/internals.py index c2af6e395a7e5..c5ebac85f0ac7 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -168,7 +168,7 @@ def reindex_items_from(self, new_ref_items, copy=True): reindexed : Block """ new_ref_items, indexer = self.items.reindex(new_ref_items) - + if indexer is None: new_items = new_ref_items new_values = self.values.copy() if copy else self.values @@ -1074,7 +1074,7 @@ def _set_ref_locs(self, labels=None, do_refs=False): if we have a non-unique index on this axis, set the indexers we need to set an absolute indexer for the blocks return the indexer if we are not unique - + labels : the (new) labels for this manager ref : boolean, whether to set the labels (one a 1-1 mapping) @@ -1088,7 +1088,7 @@ def _set_ref_locs(self, labels=None, do_refs=False): if is_unique and not do_refs: if not self.items.is_unique: - + # reset our ref locs self._ref_locs = None for b in self.blocks: @@ -1130,12 +1130,12 @@ def _set_ref_locs(self, labels=None, do_refs=False): self._ref_locs = rl return rl - # return our cached _ref_locs (or will compute again + # return our cached _ref_locs (or will compute again # when we recreate the block manager if needed return getattr(self,'_ref_locs',None) def get_items_map(self, use_cached=True): - """ + """ return an inverted ref_loc map for an item index block -> item (in that block) location -> column location @@ -1166,7 +1166,7 @@ def get_items_map(self, use_cached=True): else: for i, (block, idx) in enumerate(rl): - + m = maybe_create_block_in_items_map(im,block) m[idx] = i @@ -1362,11 +1362,13 @@ def _consolidate_check(self): @property def is_mixed_type(self): + # Warning, consolidation needs to get checked upstairs self._consolidate_inplace() return len(self.blocks) > 1 @property def is_numeric_mixed_type(self): + # Warning, consolidation needs to get checked upstairs self._consolidate_inplace() return all([ block.is_numeric for block in self.blocks ]) @@ -1438,9 +1440,9 @@ def get_slice(self, slobj, axis=0, raise_on_error=False): new_items = new_axes[0] if len(self.blocks) == 1: blk = self.blocks[0] - newb = make_block(blk.values[slobj], + newb = make_block(blk.values[slobj], + new_items, new_items, - new_items, klass=blk.__class__, fastpath=True, placement=blk._ref_locs) @@ -1462,9 +1464,9 @@ def _slice_blocks(self, slobj, axis): slicer = tuple(slicer) for block in self.blocks: - newb = make_block(block.values[slicer], + newb = make_block(block.values[slicer], block.items, - block.ref_items, + block.ref_items, klass=block.__class__, fastpath=True, placement=block._ref_locs) @@ -1576,9 +1578,9 @@ def xs(self, key, axis=1, copy=True): raise Exception('cannot get view of mixed-type or ' 'non-consolidated DataFrame') for blk in self.blocks: - newb = make_block(blk.values[slicer], - blk.items, - blk.ref_items, + newb = make_block(blk.values[slicer], + blk.items, + blk.ref_items, klass=blk.__class__, fastpath=True) new_blocks.append(newb) @@ -1587,8 +1589,8 @@ def xs(self, key, axis=1, copy=True): vals = block.values[slicer] if copy: vals = vals.copy() - new_blocks = [make_block(vals, - self.items, + new_blocks = [make_block(vals, + self.items, self.items, klass=block.__class__, fastpath=True)] @@ -1637,7 +1639,6 @@ def consolidate(self): def _consolidate_inplace(self): if not self.is_consolidated(): - self.blocks = _consolidate(self.blocks, self.items) # reset our mappings @@ -1703,7 +1704,7 @@ def delete(self, item): # dupe keys may return mask loc = _possibly_convert_to_indexer(loc) self._delete_from_all_blocks(loc, item) - + # _ref_locs, and _items_map are good here new_items = self.items.delete(loc) self.set_items_norename(new_items) @@ -1763,7 +1764,7 @@ def _set_item(item, arr): if self.items.is_unique: self._reset_ref_locs() self._set_ref_locs(do_refs='force') - + self._rebuild_ref_locs() @@ -1893,7 +1894,7 @@ def _delete_from_block(self, i, item): # reset the ref_locs based on the now good block._ref_locs self._reset_ref_locs() - + def _add_new_block(self, item, value, loc=None): # Do we care about dtype at the moment? @@ -1919,7 +1920,7 @@ def _add_new_block(self, item, value, loc=None): self._ref_locs[i] = self._ref_locs[i-1] self._ref_locs[loc] = (new_block, 0) - + # and reset self._reset_ref_locs() self._set_ref_locs(do_refs=True) @@ -2081,7 +2082,7 @@ def take(self, indexer, new_index=None, axis=1, verify=True): if new_index is None: new_index = self.axes[axis].take(indexer) - new_axes[axis] = new_index + new_axes[axis] = new_index return self.apply('take',axes=new_axes,indexer=indexer,ref_items=new_axes[0],axis=axis) def merge(self, other, lsuffix=None, rsuffix=None): @@ -2453,7 +2454,7 @@ def _lcd_dtype(l): if lcd.kind == 'u': return np.dtype('int%s' % (lcd.itemsize*8*2)) return lcd - + elif have_dt64 and not have_float and not have_complex: return np.dtype('M8[ns]') elif have_complex: @@ -2500,7 +2501,7 @@ def _merge_blocks(blocks, items, dtype=None): new_ref_locs = [ b._ref_locs for b in blocks ] if all([ x is not None for x in new_ref_locs ]): new_block.set_ref_locs(np.concatenate(new_ref_locs)) - return new_block + return new_block def _block_shape(values, ndim=1, shape=None): diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 7dacacd8ad1fd..2f95097d9ca57 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -45,12 +45,13 @@ def _skip_if_no_scipy(): except ImportError: raise nose.SkipTest -#------------------------------------------------------------------------------- +#--------------------------------------------------------------------- # DataFrame test cases JOIN_TYPES = ['inner', 'outer', 'left', 'right'] MIXED_FLOAT_DTYPES = ['float16','float32','float64'] -MIXED_INT_DTYPES = ['uint8','uint16','uint32','uint64','int8','int16','int32','int64'] +MIXED_INT_DTYPES = ['uint8','uint16','uint32','uint64','int8','int16', + 'int32','int64'] def _check_mixed_float(df, dtype = None): @@ -3188,7 +3189,8 @@ def test_operators_timedelta64(self): result = mixed.min(axis=1) # GH 3106 - df = DataFrame({ 'time' : date_range('20130102',periods=5), 'time2' : date_range('20130105',periods=5) }) + df = DataFrame({'time' : date_range('20130102',periods=5), + 'time2' : date_range('20130105',periods=5) }) df['off1'] = df['time2']-df['time'] self.assert_(df['off1'].dtype == 'timedelta64[ns]') @@ -3197,6 +3199,24 @@ def test_operators_timedelta64(self): self.assertTrue(df['off1'].dtype == 'timedelta64[ns]') self.assertTrue(df['off2'].dtype == 'timedelta64[ns]') + def test__slice_consolidate_invalidate_item_cache(self): + # #3970 + df = DataFrame({ "aa":range(5), "bb":[2.2]*5}) + + # Creates a second float block + df["cc"] = 0.0 + + # caches a reference to the 'bb' series + df["bb"] + + # repr machinery triggers consolidation + repr(df) + + # Assignment to wrong series + df['bb'].iloc[0] = 0.17 + df._clear_item_cache() + self.assertAlmostEqual(df['bb'][0], 0.17) + def test_new_empty_index(self): df1 = DataFrame(randn(0, 3)) df2 = DataFrame(randn(0, 3)) @@ -7514,7 +7534,7 @@ def _safe_add(df): def is_ok(s): return issubclass(s.dtype.type, (np.integer,np.floating)) and s.dtype != 'uint8' return DataFrame(dict([ (c,s+1) if is_ok(s) else (c,s) for c, s in df.iteritems() ])) - + def _check_get(df, cond, check_dtypes = True): other1 = _safe_add(df) rs = df.where(cond, other1) @@ -8483,7 +8503,10 @@ def test_sort_datetimes(self): df = DataFrame(['a','a','a','b','c','d','e','f','g'], columns=['A'], index=date_range('20130101',periods=9)) - dts = [ Timestamp(x) for x in ['2004-02-11','2004-01-21','2004-01-26','2005-09-20','2010-10-04','2009-05-12','2008-11-12','2010-09-28','2010-09-28'] ] + dts = [Timestamp(x) + for x in ['2004-02-11','2004-01-21','2004-01-26', + '2005-09-20','2010-10-04','2009-05-12', + '2008-11-12','2010-09-28','2010-09-28']] df['B'] = dts[::2] + dts[1::2] df['C'] = 2. df['A1'] = 3.