Commit 2041d72

BUG: fix the item cache invalidation bug in pandas-dev#3970, caused by is_mixed_type silently consolidating; also fix a stable-sorting bug that shows up on my machine
1 parent 99acda4 commit 2041d72

File tree

4 files changed, +81 -35 lines

  pandas/core/frame.py
  pandas/core/generic.py
  pandas/core/internals.py
  pandas/tests/test_frame.py
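For context, a minimal sketch of the symptom this commit fixes, adapted from the test added to pandas/tests/test_frame.py below; it assumes the pandas internals of this era, where a DataFrame keeps recently accessed column Series in an item cache backed by the BlockManager's blocks.

from pandas import DataFrame

df = DataFrame({"aa": range(5), "bb": [2.2] * 5})
df["cc"] = 0.0      # adds a second float block, leaving the manager unconsolidated
df["bb"]            # caches a reference to the 'bb' Series in the item cache
repr(df)            # the repr machinery silently consolidates the blocks in place

# Before this fix, the cached 'bb' Series still pointed at the old,
# pre-consolidation block, so this write landed on stale data:
df["bb"].iloc[0] = 0.17
print(df["bb"][0])  # prints 0.17 only if the item cache was invalidated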

pandas/core/frame.py  (+18 -5)

@@ -2100,7 +2100,15 @@ def _slice(self, slobj, axis=0, raise_on_error=False):
         else:
             mgr_axis = 0
 
-        new_data = self._data.get_slice(slobj, axis=mgr_axis, raise_on_error=raise_on_error)
+        # Super bad smell, need to review all this cache inval / block business
+        blocks_before = len(self._data.blocks)
+        new_data = self._data.get_slice(slobj, axis=mgr_axis,
+                                        raise_on_error=raise_on_error)
+
+        # Internal consolidation requires cache invalidation
+        if len(self._data.blocks) != blocks_before:
+            self._clear_item_cache()
+
         return self._constructor(new_data)
 
     def _box_item_values(self, key, values):

@@ -3226,7 +3234,8 @@ def sort(self, columns=None, column=None, axis=0, ascending=True,
         return self.sort_index(by=columns, axis=axis, ascending=ascending,
                                inplace=inplace)
 
-    def sort_index(self, axis=0, by=None, ascending=True, inplace=False):
+    def sort_index(self, axis=0, by=None, ascending=True, inplace=False,
+                   kind='quicksort'):
         """
         Sort DataFrame either by labels (along either axis) or by the values in
         a column

@@ -3263,7 +3272,10 @@ def sort_index(self, axis=0, by=None, ascending=True, inplace=False):
         if by is not None:
             if axis != 0:
                 raise AssertionError('Axis must be 0')
-            if isinstance(by, (tuple, list)):
+            if not isinstance(by, (tuple, list)):
+                by = [by]
+
+            if len(by) > 1:
                 keys = []
                 for x in by:
                     k = self[x].values

@@ -3281,18 +3293,19 @@ def trans(v):
                 indexer = _lexsort_indexer(keys, orders=ascending)
                 indexer = com._ensure_platform_int(indexer)
             else:
+                by = by[0]
                 k = self[by].values
                 if k.ndim == 2:
                     raise ValueError('Cannot sort by duplicate column %s'
                                      % str(by))
-                indexer = k.argsort()
+                indexer = k.argsort(kind=kind)
                 if not ascending:
                     indexer = indexer[::-1]
         elif isinstance(labels, MultiIndex):
             indexer = _lexsort_indexer(labels.labels, orders=ascending)
             indexer = com._ensure_platform_int(indexer)
         else:
-            indexer = labels.argsort()
+            indexer = labels.argsort(kind=kind)
             if not ascending:
                 indexer = indexer[::-1]
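The new kind argument added above is passed straight through to ndarray.argsort. numpy's default 'quicksort' is not stable, so rows with equal sort keys can be reordered; passing 'mergesort', numpy's stable kind, preserves their original order (the duplicate '2010-09-28' timestamps in test_sort_datetimes below create exactly this kind of tie). A small usage sketch, assuming the patched signature:

from pandas import DataFrame

df = DataFrame({"key": [2, 1, 2, 1], "val": ["a", "b", "c", "d"]})

# Default quicksort: the relative order of the two key == 1 rows
# (and of the two key == 2 rows) is not guaranteed
by_key = df.sort_index(by="key")

# Stable sort: 'b' stays before 'd' and 'a' stays before 'c'
by_key_stable = df.sort_index(by="key", kind="mergesort")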

pandas/core/generic.py  (+11 -2)

@@ -775,11 +775,20 @@ def consolidate(self, inplace=False):
 
     @property
     def _is_mixed_type(self):
-        return self._data.is_mixed_type
+        f = lambda: self._data.is_mixed_type
+        return self._protect_consolidate(f)
 
     @property
     def _is_numeric_mixed_type(self):
-        return self._data.is_numeric_mixed_type
+        f = lambda: self._data.is_numeric_mixed_type
+        return self._protect_consolidate(f)
+
+    def _protect_consolidate(self, f):
+        blocks_before = len(self._data.blocks)
+        result = f()
+        if len(self._data.blocks) != blocks_before:
+            self._clear_item_cache()
+        return result
 
     def _reindex_axis(self, new_index, fill_method, axis, copy):
         new_data = self._data.reindex_axis(new_index, axis=axis,
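The _protect_consolidate helper added above samples the block count before and after running the wrapped callable; if consolidation merged blocks in the meantime, cached column Series may point at dead blocks, so the item cache is dropped. A standalone sketch of the effect, assuming the internals of this era (df._data is the BlockManager, .blocks its list of blocks):

from pandas import DataFrame

df = DataFrame({"aa": range(5), "bb": [2.2] * 5})
df["cc"] = 0.0                 # unconsolidated: one int block plus two float blocks
before = len(df._data.blocks)  # 3
df._is_mixed_type              # BlockManager.is_mixed_type consolidates in place
after = len(df._data.blocks)   # 2: the float blocks were merged
# Because the count changed, _protect_consolidate clears the item cache,
# so any previously cached column Series is rebuilt on the next access.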

pandas/core/internals.py  (+24 -23)

@@ -168,7 +168,7 @@ def reindex_items_from(self, new_ref_items, copy=True):
         reindexed : Block
         """
         new_ref_items, indexer = self.items.reindex(new_ref_items)
-
+
         if indexer is None:
             new_items = new_ref_items
             new_values = self.values.copy() if copy else self.values

@@ -1074,7 +1074,7 @@ def _set_ref_locs(self, labels=None, do_refs=False):
         if we have a non-unique index on this axis, set the indexers
         we need to set an absolute indexer for the blocks
         return the indexer if we are not unique
-
+
         labels : the (new) labels for this manager
         ref : boolean, whether to set the labels (one a 1-1 mapping)

@@ -1088,7 +1088,7 @@ def _set_ref_locs(self, labels=None, do_refs=False):
         if is_unique and not do_refs:
 
             if not self.items.is_unique:
-
+
                 # reset our ref locs
                 self._ref_locs = None
                 for b in self.blocks:

@@ -1130,12 +1130,12 @@ def _set_ref_locs(self, labels=None, do_refs=False):
             self._ref_locs = rl
             return rl
 
-        # return our cached _ref_locs (or will compute again
+        # return our cached _ref_locs (or will compute again
        # when we recreate the block manager if needed
        return getattr(self,'_ref_locs',None)
 
     def get_items_map(self, use_cached=True):
-        """
+        """
         return an inverted ref_loc map for an item index
         block -> item (in that block) location -> column location

@@ -1166,7 +1166,7 @@ def get_items_map(self, use_cached=True):
         else:
 
             for i, (block, idx) in enumerate(rl):
-
+
                 m = maybe_create_block_in_items_map(im,block)
                 m[idx] = i
 
@@ -1362,11 +1362,13 @@ def _consolidate_check(self):
 
     @property
     def is_mixed_type(self):
+        # Warning, consolidation needs to get checked upstairs
         self._consolidate_inplace()
         return len(self.blocks) > 1
 
     @property
     def is_numeric_mixed_type(self):
+        # Warning, consolidation needs to get checked upstairs
         self._consolidate_inplace()
         return all([ block.is_numeric for block in self.blocks ])

@@ -1438,9 +1440,9 @@ def get_slice(self, slobj, axis=0, raise_on_error=False):
         new_items = new_axes[0]
         if len(self.blocks) == 1:
             blk = self.blocks[0]
-            newb = make_block(blk.values[slobj],
+            newb = make_block(blk.values[slobj],
+                              new_items,
                               new_items,
-                              new_items,
                               klass=blk.__class__,
                               fastpath=True,
                               placement=blk._ref_locs)

@@ -1462,9 +1464,9 @@ def _slice_blocks(self, slobj, axis):
             slicer = tuple(slicer)
 
         for block in self.blocks:
-            newb = make_block(block.values[slicer],
+            newb = make_block(block.values[slicer],
                               block.items,
-                              block.ref_items,
+                              block.ref_items,
                               klass=block.__class__,
                               fastpath=True,
                               placement=block._ref_locs)

@@ -1576,9 +1578,9 @@ def xs(self, key, axis=1, copy=True):
                 raise Exception('cannot get view of mixed-type or '
                                 'non-consolidated DataFrame')
             for blk in self.blocks:
-                newb = make_block(blk.values[slicer],
-                                  blk.items,
-                                  blk.ref_items,
+                newb = make_block(blk.values[slicer],
+                                  blk.items,
+                                  blk.ref_items,
                                   klass=blk.__class__,
                                   fastpath=True)
                 new_blocks.append(newb)

@@ -1587,8 +1589,8 @@ def xs(self, key, axis=1, copy=True):
             vals = block.values[slicer]
             if copy:
                 vals = vals.copy()
-            new_blocks = [make_block(vals,
-                                     self.items,
+            new_blocks = [make_block(vals,
+                                     self.items,
                                      self.items,
                                      klass=block.__class__,
                                      fastpath=True)]

@@ -1637,7 +1639,6 @@ def consolidate(self):
 
     def _consolidate_inplace(self):
         if not self.is_consolidated():
-
             self.blocks = _consolidate(self.blocks, self.items)
 
             # reset our mappings

@@ -1703,7 +1704,7 @@ def delete(self, item):
         # dupe keys may return mask
         loc = _possibly_convert_to_indexer(loc)
         self._delete_from_all_blocks(loc, item)
-
+
         # _ref_locs, and _items_map are good here
         new_items = self.items.delete(loc)
         self.set_items_norename(new_items)

@@ -1763,7 +1764,7 @@ def _set_item(item, arr):
             if self.items.is_unique:
                 self._reset_ref_locs()
                 self._set_ref_locs(do_refs='force')
-
+
             self._rebuild_ref_locs()
 
 
@@ -1893,7 +1894,7 @@ def _delete_from_block(self, i, item):
 
         # reset the ref_locs based on the now good block._ref_locs
         self._reset_ref_locs()
-
+
     def _add_new_block(self, item, value, loc=None):
         # Do we care about dtype at the moment?
 
@@ -1919,7 +1920,7 @@ def _add_new_block(self, item, value, loc=None):
             self._ref_locs[i] = self._ref_locs[i-1]
 
         self._ref_locs[loc] = (new_block, 0)
-
+
         # and reset
         self._reset_ref_locs()
         self._set_ref_locs(do_refs=True)

@@ -2081,7 +2082,7 @@ def take(self, indexer, new_index=None, axis=1, verify=True):
         if new_index is None:
             new_index = self.axes[axis].take(indexer)
 
-        new_axes[axis] = new_index
+        new_axes[axis] = new_index
         return self.apply('take',axes=new_axes,indexer=indexer,ref_items=new_axes[0],axis=axis)
 
     def merge(self, other, lsuffix=None, rsuffix=None):

@@ -2453,7 +2454,7 @@ def _lcd_dtype(l):
             if lcd.kind == 'u':
                 return np.dtype('int%s' % (lcd.itemsize*8*2))
             return lcd
-
+
         elif have_dt64 and not have_float and not have_complex:
             return np.dtype('M8[ns]')
         elif have_complex:

@@ -2500,7 +2501,7 @@ def _merge_blocks(blocks, items, dtype=None):
         new_ref_locs = [ b._ref_locs for b in blocks ]
         if all([ x is not None for x in new_ref_locs ]):
             new_block.set_ref_locs(np.concatenate(new_ref_locs))
-    return new_block
+    return new_block
 
 
 def _block_shape(values, ndim=1, shape=None):
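At the BlockManager level, consolidation merges blocks of the same dtype into one, which is why the block count (and block identity) can change under a caller's feet; the comments added to is_mixed_type and is_numeric_mixed_type above flag that this has to be accounted for "upstairs" in NDFrame. A rough sketch, again assuming the internals of this era:

import numpy as np
from pandas import DataFrame

df = DataFrame({"aa": np.arange(5), "bb": [2.2] * 5})
df["cc"] = 0.0
print([b.dtype for b in df._data.blocks])   # e.g. [int64, float64, float64]

df._data._consolidate_inplace()             # merges the two float64 blocks
print([b.dtype for b in df._data.blocks])   # e.g. [int64, float64]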

pandas/tests/test_frame.py  (+28 -5)

@@ -45,12 +45,13 @@ def _skip_if_no_scipy():
     except ImportError:
         raise nose.SkipTest
 
-#-------------------------------------------------------------------------------
+#---------------------------------------------------------------------
 # DataFrame test cases
 
 JOIN_TYPES = ['inner', 'outer', 'left', 'right']
 MIXED_FLOAT_DTYPES = ['float16','float32','float64']
-MIXED_INT_DTYPES = ['uint8','uint16','uint32','uint64','int8','int16','int32','int64']
+MIXED_INT_DTYPES = ['uint8','uint16','uint32','uint64','int8','int16',
+                    'int32','int64']
 
 def _check_mixed_float(df, dtype = None):

@@ -3188,7 +3189,8 @@ def test_operators_timedelta64(self):
         result = mixed.min(axis=1)
 
         # GH 3106
-        df = DataFrame({ 'time' : date_range('20130102',periods=5), 'time2' : date_range('20130105',periods=5) })
+        df = DataFrame({'time' : date_range('20130102',periods=5),
+                        'time2' : date_range('20130105',periods=5) })
         df['off1'] = df['time2']-df['time']
         self.assert_(df['off1'].dtype == 'timedelta64[ns]')

@@ -3197,6 +3199,24 @@ def test_operators_timedelta64(self):
         self.assertTrue(df['off1'].dtype == 'timedelta64[ns]')
         self.assertTrue(df['off2'].dtype == 'timedelta64[ns]')
 
+    def test__slice_consolidate_invalidate_item_cache(self):
+        # #3970
+        df = DataFrame({ "aa":range(5), "bb":[2.2]*5})
+
+        # Creates a second float block
+        df["cc"] = 0.0
+
+        # caches a reference to the 'bb' series
+        df["bb"]
+
+        # repr machinery triggers consolidation
+        repr(df)
+
+        # Assignment to wrong series
+        df['bb'].iloc[0] = 0.17
+        df._clear_item_cache()
+        self.assertAlmostEqual(df['bb'][0], 0.17)
+
     def test_new_empty_index(self):
         df1 = DataFrame(randn(0, 3))
         df2 = DataFrame(randn(0, 3))

@@ -7514,7 +7534,7 @@ def _safe_add(df):
            def is_ok(s):
                return issubclass(s.dtype.type, (np.integer,np.floating)) and s.dtype != 'uint8'
            return DataFrame(dict([ (c,s+1) if is_ok(s) else (c,s) for c, s in df.iteritems() ]))
-
+
         def _check_get(df, cond, check_dtypes = True):
             other1 = _safe_add(df)
             rs = df.where(cond, other1)

@@ -8483,7 +8503,10 @@ def test_sort_datetimes(self):
         df = DataFrame(['a','a','a','b','c','d','e','f','g'],
                        columns=['A'],
                        index=date_range('20130101',periods=9))
-        dts = [ Timestamp(x) for x in ['2004-02-11','2004-01-21','2004-01-26','2005-09-20','2010-10-04','2009-05-12','2008-11-12','2010-09-28','2010-09-28'] ]
+        dts = [Timestamp(x)
+               for x in ['2004-02-11','2004-01-21','2004-01-26',
+                         '2005-09-20','2010-10-04','2009-05-12',
+                         '2008-11-12','2010-09-28','2010-09-28']]
         df['B'] = dts[::2] + dts[1::2]
         df['C'] = 2.
         df['A1'] = 3.
