Commit 2041d72

BUG: fix the item cache invalidation bug in pandas-dev#3970, caused by is_mixed_type silently consolidating; also fix a stable-sorting bug that shows up on my machine
1 parent 99acda4 commit 2041d72

File tree

4 files changed, +81 -35 lines

  pandas/core/frame.py
  pandas/core/generic.py
  pandas/core/internals.py
  pandas/tests/test_frame.py
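For context, a minimal sketch of the symptom this commit fixes, adapted from the test added to pandas/tests/test_frame.py below; it assumes the pandas internals of this era, where a DataFrame keeps recently accessed column Series in an item cache backed by the BlockManager's blocks.

from pandas import DataFrame

df = DataFrame({"aa": range(5), "bb": [2.2] * 5})
df["cc"] = 0.0      # adds a second float block, leaving the manager unconsolidated
df["bb"]            # caches a reference to the 'bb' Series in the item cache
repr(df)            # the repr machinery silently consolidates the blocks in place

# Before this fix, the cached 'bb' Series still pointed at the old,
# pre-consolidation block, so this write landed on stale data:
df["bb"].iloc[0] = 0.17
print(df["bb"][0])  # prints 0.17 only if the item cache was invalidated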

pandas/core/frame.py  (+18 -5)

@@ -2100,7 +2100,15 @@ def _slice(self, slobj, axis=0, raise_on_error=False):
         else:
             mgr_axis = 0
 
-        new_data = self._data.get_slice(slobj, axis=mgr_axis, raise_on_error=raise_on_error)
+        # Super bad smell, need to review all this cache inval / block business
+        blocks_before = len(self._data.blocks)
+        new_data = self._data.get_slice(slobj, axis=mgr_axis,
+                                        raise_on_error=raise_on_error)
+
+        # Internal consolidation requires cache invalidation
+        if len(self._data.blocks) != blocks_before:
+            self._clear_item_cache()
+
         return self._constructor(new_data)
 
     def _box_item_values(self, key, values):

@@ -3226,7 +3234,8 @@ def sort(self, columns=None, column=None, axis=0, ascending=True,
         return self.sort_index(by=columns, axis=axis, ascending=ascending,
                                inplace=inplace)
 
-    def sort_index(self, axis=0, by=None, ascending=True, inplace=False):
+    def sort_index(self, axis=0, by=None, ascending=True, inplace=False,
+                   kind='quicksort'):
         """
         Sort DataFrame either by labels (along either axis) or by the values in
         a column

@@ -3263,7 +3272,10 @@ def sort_index(self, axis=0, by=None, ascending=True, inplace=False):
         if by is not None:
             if axis != 0:
                 raise AssertionError('Axis must be 0')
-            if isinstance(by, (tuple, list)):
+            if not isinstance(by, (tuple, list)):
+                by = [by]
+
+            if len(by) > 1:
                 keys = []
                 for x in by:
                     k = self[x].values

@@ -3281,18 +3293,19 @@ def trans(v):
                 indexer = _lexsort_indexer(keys, orders=ascending)
                 indexer = com._ensure_platform_int(indexer)
             else:
+                by = by[0]
                 k = self[by].values
                 if k.ndim == 2:
                     raise ValueError('Cannot sort by duplicate column %s'
                                      % str(by))
-                indexer = k.argsort()
+                indexer = k.argsort(kind=kind)
                 if not ascending:
                     indexer = indexer[::-1]
         elif isinstance(labels, MultiIndex):
             indexer = _lexsort_indexer(labels.labels, orders=ascending)
             indexer = com._ensure_platform_int(indexer)
         else:
-            indexer = labels.argsort()
+            indexer = labels.argsort(kind=kind)
             if not ascending:
                 indexer = indexer[::-1]
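The new kind argument added above is passed straight through to ndarray.argsort. numpy's default 'quicksort' is not stable, so rows with equal sort keys can be reordered; passing 'mergesort', numpy's stable kind, preserves their original order (the duplicate '2010-09-28' timestamps in test_sort_datetimes below create exactly this kind of tie). A small usage sketch, assuming the patched signature:

from pandas import DataFrame

df = DataFrame({"key": [2, 1, 2, 1], "val": ["a", "b", "c", "d"]})

# Default quicksort: the relative order of the two key == 1 rows
# (and of the two key == 2 rows) is not guaranteed
by_key = df.sort_index(by="key")

# Stable sort: 'b' stays before 'd' and 'a' stays before 'c'
by_key_stable = df.sort_index(by="key", kind="mergesort")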

pandas/core/generic.py  (+11 -2)

@@ -775,11 +775,20 @@ def consolidate(self, inplace=False):
 
     @property
     def _is_mixed_type(self):
-        return self._data.is_mixed_type
+        f = lambda: self._data.is_mixed_type
+        return self._protect_consolidate(f)
 
     @property
     def _is_numeric_mixed_type(self):
-        return self._data.is_numeric_mixed_type
+        f = lambda: self._data.is_numeric_mixed_type
+        return self._protect_consolidate(f)
+
+    def _protect_consolidate(self, f):
+        blocks_before = len(self._data.blocks)
+        result = f()
+        if len(self._data.blocks) != blocks_before:
+            self._clear_item_cache()
+        return result
 
     def _reindex_axis(self, new_index, fill_method, axis, copy):
         new_data = self._data.reindex_axis(new_index, axis=axis,
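The _protect_consolidate helper added above samples the block count before and after running the wrapped callable; if consolidation merged blocks in the meantime, cached column Series may point at dead blocks, so the item cache is dropped. A standalone sketch of the effect, assuming the internals of this era (df._data is the BlockManager, .blocks its list of blocks):

from pandas import DataFrame

df = DataFrame({"aa": range(5), "bb": [2.2] * 5})
df["cc"] = 0.0                 # unconsolidated: one int block plus two float blocks
before = len(df._data.blocks)  # 3
df._is_mixed_type              # BlockManager.is_mixed_type consolidates in place
after = len(df._data.blocks)   # 2: the float blocks were merged
# Because the count changed, _protect_consolidate clears the item cache,
# so any previously cached column Series is rebuilt on the next access.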

pandas/core/internals.py  (+24 -23)

@@ -168,7 +168,7 @@ def reindex_items_from(self, new_ref_items, copy=True):
         reindexed : Block
         """
         new_ref_items, indexer = self.items.reindex(new_ref_items)
-
+
         if indexer is None:
             new_items = new_ref_items
             new_values = self.values.copy() if copy else self.values

@@ -1074,7 +1074,7 @@ def _set_ref_locs(self, labels=None, do_refs=False):
         if we have a non-unique index on this axis, set the indexers
         we need to set an absolute indexer for the blocks
         return the indexer if we are not unique
-
+
         labels : the (new) labels for this manager
         ref : boolean, whether to set the labels (one a 1-1 mapping)

@@ -1088,7 +1088,7 @@ def _set_ref_locs(self, labels=None, do_refs=False):
         if is_unique and not do_refs:
 
             if not self.items.is_unique:
-
+
                 # reset our ref locs
                 self._ref_locs = None
                 for b in self.blocks:

@@ -1130,12 +1130,12 @@ def _set_ref_locs(self, labels=None, do_refs=False):
             self._ref_locs = rl
             return rl
 
-        # return our cached _ref_locs (or will compute again
+        # return our cached _ref_locs (or will compute again
        # when we recreate the block manager if needed
        return getattr(self,'_ref_locs',None)
 
     def get_items_map(self, use_cached=True):
-        """
+        """
         return an inverted ref_loc map for an item index
         block -> item (in that block) location -> column location

@@ -1166,7 +1166,7 @@ def get_items_map(self, use_cached=True):
         else:
 
             for i, (block, idx) in enumerate(rl):
-
+
                 m = maybe_create_block_in_items_map(im,block)
                 m[idx] = i
 
@@ -1362,11 +1362,13 @@ def _consolidate_check(self):
 
     @property
     def is_mixed_type(self):
+        # Warning, consolidation needs to get checked upstairs
         self._consolidate_inplace()
         return len(self.blocks) > 1
 
     @property
     def is_numeric_mixed_type(self):
+        # Warning, consolidation needs to get checked upstairs
         self._consolidate_inplace()
         return all([ block.is_numeric for block in self.blocks ])

@@ -1438,9 +1440,9 @@ def get_slice(self, slobj, axis=0, raise_on_error=False):
         new_items = new_axes[0]
         if len(self.blocks) == 1:
             blk = self.blocks[0]
-            newb = make_block(blk.values[slobj],
+            newb = make_block(blk.values[slobj],
+                              new_items,
                               new_items,
-                              new_items,
                               klass=blk.__class__,
                               fastpath=True,
                               placement=blk._ref_locs)

@@ -1462,9 +1464,9 @@ def _slice_blocks(self, slobj, axis):
             slicer = tuple(slicer)
 
         for block in self.blocks:
-            newb = make_block(block.values[slicer],
+            newb = make_block(block.values[slicer],
                               block.items,
-                              block.ref_items,
+                              block.ref_items,
                               klass=block.__class__,
                               fastpath=True,
                               placement=block._ref_locs)

@@ -1576,9 +1578,9 @@ def xs(self, key, axis=1, copy=True):
                 raise Exception('cannot get view of mixed-type or '
                                 'non-consolidated DataFrame')
             for blk in self.blocks:
-                newb = make_block(blk.values[slicer],
-                                  blk.items,
-                                  blk.ref_items,
+                newb = make_block(blk.values[slicer],
+                                  blk.items,
+                                  blk.ref_items,
                                   klass=blk.__class__,
                                   fastpath=True)
                 new_blocks.append(newb)

@@ -1587,8 +1589,8 @@ def xs(self, key, axis=1, copy=True):
             vals = block.values[slicer]
             if copy:
                 vals = vals.copy()
-            new_blocks = [make_block(vals,
-                                     self.items,
+            new_blocks = [make_block(vals,
+                                     self.items,
                                      self.items,
                                      klass=block.__class__,
                                      fastpath=True)]

@@ -1637,7 +1639,6 @@ def consolidate(self):
 
     def _consolidate_inplace(self):
         if not self.is_consolidated():
-
             self.blocks = _consolidate(self.blocks, self.items)
 
             # reset our mappings

@@ -1703,7 +1704,7 @@ def delete(self, item):
         # dupe keys may return mask
         loc = _possibly_convert_to_indexer(loc)
         self._delete_from_all_blocks(loc, item)
-
+
         # _ref_locs, and _items_map are good here
         new_items = self.items.delete(loc)
         self.set_items_norename(new_items)

@@ -1763,7 +1764,7 @@ def _set_item(item, arr):
             if self.items.is_unique:
                 self._reset_ref_locs()
                 self._set_ref_locs(do_refs='force')
-
+
             self._rebuild_ref_locs()
 
 
@@ -1893,7 +1894,7 @@ def _delete_from_block(self, i, item):
 
         # reset the ref_locs based on the now good block._ref_locs
         self._reset_ref_locs()
-
+
     def _add_new_block(self, item, value, loc=None):
         # Do we care about dtype at the moment?
 
@@ -1919,7 +1920,7 @@ def _add_new_block(self, item, value, loc=None):
             self._ref_locs[i] = self._ref_locs[i-1]
 
         self._ref_locs[loc] = (new_block, 0)
-
+
         # and reset
         self._reset_ref_locs()
         self._set_ref_locs(do_refs=True)

@@ -2081,7 +2082,7 @@ def take(self, indexer, new_index=None, axis=1, verify=True):
         if new_index is None:
             new_index = self.axes[axis].take(indexer)
 
-        new_axes[axis] = new_index
+        new_axes[axis] = new_index
         return self.apply('take',axes=new_axes,indexer=indexer,ref_items=new_axes[0],axis=axis)
 
     def merge(self, other, lsuffix=None, rsuffix=None):

@@ -2453,7 +2454,7 @@ def _lcd_dtype(l):
             if lcd.kind == 'u':
                 return np.dtype('int%s' % (lcd.itemsize*8*2))
             return lcd
-
+
         elif have_dt64 and not have_float and not have_complex:
             return np.dtype('M8[ns]')
         elif have_complex:

@@ -2500,7 +2501,7 @@ def _merge_blocks(blocks, items, dtype=None):
         new_ref_locs = [ b._ref_locs for b in blocks ]
         if all([ x is not None for x in new_ref_locs ]):
             new_block.set_ref_locs(np.concatenate(new_ref_locs))
-    return new_block
+    return new_block
 
 
 def _block_shape(values, ndim=1, shape=None):
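At the BlockManager level, consolidation merges blocks of the same dtype into one, which is why the block count (and block identity) can change under a caller's feet; the comments added to is_mixed_type and is_numeric_mixed_type above flag that this has to be accounted for "upstairs" in NDFrame. A rough sketch, again assuming the internals of this era:

import numpy as np
from pandas import DataFrame

df = DataFrame({"aa": np.arange(5), "bb": [2.2] * 5})
df["cc"] = 0.0
print([b.dtype for b in df._data.blocks])   # e.g. [int64, float64, float64]

df._data._consolidate_inplace()             # merges the two float64 blocks
print([b.dtype for b in df._data.blocks])   # e.g. [int64, float64]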

pandas/tests/test_frame.py  (+28 -5)

@@ -45,12 +45,13 @@ def _skip_if_no_scipy():
     except ImportError:
         raise nose.SkipTest
 
-#-------------------------------------------------------------------------------
+#---------------------------------------------------------------------
 # DataFrame test cases
 
 JOIN_TYPES = ['inner', 'outer', 'left', 'right']
 MIXED_FLOAT_DTYPES = ['float16','float32','float64']
-MIXED_INT_DTYPES = ['uint8','uint16','uint32','uint64','int8','int16','int32','int64']
+MIXED_INT_DTYPES = ['uint8','uint16','uint32','uint64','int8','int16',
+                    'int32','int64']
 
 def _check_mixed_float(df, dtype = None):

@@ -3188,7 +3189,8 @@ def test_operators_timedelta64(self):
         result = mixed.min(axis=1)
 
         # GH 3106
-        df = DataFrame({ 'time' : date_range('20130102',periods=5), 'time2' : date_range('20130105',periods=5) })
+        df = DataFrame({'time' : date_range('20130102',periods=5),
+                        'time2' : date_range('20130105',periods=5) })
         df['off1'] = df['time2']-df['time']
         self.assert_(df['off1'].dtype == 'timedelta64[ns]')

@@ -3197,6 +3199,24 @@ def test_operators_timedelta64(self):
         self.assertTrue(df['off1'].dtype == 'timedelta64[ns]')
         self.assertTrue(df['off2'].dtype == 'timedelta64[ns]')
 
+    def test__slice_consolidate_invalidate_item_cache(self):
+        # #3970
+        df = DataFrame({ "aa":range(5), "bb":[2.2]*5})
+
+        # Creates a second float block
+        df["cc"] = 0.0
+
+        # caches a reference to the 'bb' series
+        df["bb"]
+
+        # repr machinery triggers consolidation
+        repr(df)
+
+        # Assignment to wrong series
+        df['bb'].iloc[0] = 0.17
+        df._clear_item_cache()
+        self.assertAlmostEqual(df['bb'][0], 0.17)
+
     def test_new_empty_index(self):
         df1 = DataFrame(randn(0, 3))
         df2 = DataFrame(randn(0, 3))

@@ -7514,7 +7534,7 @@ def _safe_add(df):
            def is_ok(s):
                return issubclass(s.dtype.type, (np.integer,np.floating)) and s.dtype != 'uint8'
            return DataFrame(dict([ (c,s+1) if is_ok(s) else (c,s) for c, s in df.iteritems() ]))
-
+
         def _check_get(df, cond, check_dtypes = True):
             other1 = _safe_add(df)
             rs = df.where(cond, other1)

@@ -8483,7 +8503,10 @@ def test_sort_datetimes(self):
         df = DataFrame(['a','a','a','b','c','d','e','f','g'],
                        columns=['A'],
                        index=date_range('20130101',periods=9))
-        dts = [ Timestamp(x) for x in ['2004-02-11','2004-01-21','2004-01-26','2005-09-20','2010-10-04','2009-05-12','2008-11-12','2010-09-28','2010-09-28'] ]
+        dts = [Timestamp(x)
+               for x in ['2004-02-11','2004-01-21','2004-01-26',
+                         '2005-09-20','2010-10-04','2009-05-12',
+                         '2008-11-12','2010-09-28','2010-09-28']]
         df['B'] = dts[::2] + dts[1::2]
         df['C'] = 2.
         df['A1'] = 3.
