Skip to content

Commit e9179fe

Browse files
dsm054 authored and jreback committed
BUG: DataFrame.equals should not care about block order (GH #9330)
1 parent 2af2044 commit e9179fe

File tree

5 files changed

+66
-6
lines changed

5 files changed

+66
-6
lines changed

doc/source/whatsnew/v0.16.1.txt

+1
Original file line number | Diff line number | Diff line change
@@ -71,6 +71,7 @@ Bug Fixes
7171

7272
- Bug in ``transform`` causing length mismatch when null entries were present and a fast aggregator was being used (:issue:`9697`)
7373

74+
- Bug in ``equals`` causing false negatives when block order differed (:issue:`9330`)
7475

7576
- Bug in ``DataFrame`` slicing may not retain metadata (:issue:`9776`)
7677
- Bug where ``TimedeltaIndex`` were not properly serialized in fixed ``HDFStore`` (:issue:`9635`)

pandas/core/internals.py

+13-1
Original file line number | Diff line number | Diff line change
@@ -3310,8 +3310,20 @@ def equals(self, other):
33103310
return False
33113311
self._consolidate_inplace()
33123312
other._consolidate_inplace()
3313+
if len(self.blocks) != len(other.blocks):
3314+
return False
3315+
3316+
# canonicalize block order, using a tuple combining the type
3317+
# name and then mgr_locs because there might be unconsolidated
3318+
# blocks (say, Categorical) which can only be distinguished by
3319+
# the iteration order
3320+
def canonicalize(block):
3321+
return (block.dtype.name, block.mgr_locs.as_array.tolist())
3322+
3323+
self_blocks = sorted(self.blocks, key=canonicalize)
3324+
other_blocks = sorted(other.blocks, key=canonicalize)
33133325
return all(block.equals(oblock) for block, oblock in
3314-
zip(self.blocks, other.blocks))
3326+
zip(self_blocks, other_blocks))
33153327

33163328

33173329
class SingleBlockManager(BlockManager):

pandas/io/tests/test_pytables.py

+16-2
Original file line number | Diff line number | Diff line change
@@ -4584,19 +4584,33 @@ def test_duplicate_column_name(self):
45844584
with ensure_clean_path(self.path) as path:
45854585
self.assertRaises(ValueError, df.to_hdf, path, 'df', format='fixed')
45864586

4587+
df.to_hdf(path, 'df', format='table')
4588+
other = read_hdf(path, 'df')
4589+
4590+
tm.assert_frame_equal(df, other)
4591+
self.assertTrue(df.equals(other))
4592+
self.assertTrue(other.equals(df))
4593+
4594+
def test_round_trip_equals(self):
4595+
# GH 9330
4596+
df = DataFrame({"B": [1,2], "A": ["x","y"]})
4597+
4598+
with ensure_clean_path(self.path) as path:
45874599
df.to_hdf(path, 'df', format='table')
45884600
other = read_hdf(path, 'df')
45894601
tm.assert_frame_equal(df, other)
4602+
self.assertTrue(df.equals(other))
4603+
self.assertTrue(other.equals(df))
45904604

45914605
def test_preserve_timedeltaindex_type(self):
4592-
# GH9635
4606+
# GH9635
45934607
# Storing TimedeltaIndexed DataFrames in fixed stores did not preserve
45944608
# the type of the index.
45954609
df = DataFrame(np.random.normal(size=(10,5)))
45964610
df.index = timedelta_range(start='0s',periods=10,freq='1s',name='example')
45974611

45984612
with ensure_clean_store(self.path) as store:
4599-
4613+
46004614
store['df'] = df
46014615
assert_frame_equal(store['df'], df)
46024616

pandas/tests/test_frame.py

+14
Original file line number | Diff line number | Diff line change
@@ -5944,6 +5944,20 @@ def test_boolean_comparison(self):
59445944
self.assertRaises(ValueError, lambda : df == (2,2))
59455945
self.assertRaises(ValueError, lambda : df == [2,2])
59465946

5947+
def test_equals_different_blocks(self):
5948+
# GH 9330
5949+
df0 = pd.DataFrame({"A": ["x","y"], "B": [1,2],
5950+
"C": ["w","z"]})
5951+
df1 = df0.reset_index()[["A","B","C"]]
5952+
# this assert verifies that the above operations have
5953+
# induced a block rearrangement
5954+
self.assertTrue(df0._data.blocks[0].dtype !=
5955+
df1._data.blocks[0].dtype)
5956+
# do the real tests
5957+
self.assert_frame_equal(df0, df1)
5958+
self.assertTrue(df0.equals(df1))
5959+
self.assertTrue(df1.equals(df0))
5960+
59475961
def test_to_csv_from_csv(self):
59485962

59495963
pname = '__tmp_to_csv_from_csv__'

pandas/tests/test_internals.py

+22-3
Original file line number | Diff line number | Diff line change
@@ -68,15 +68,15 @@ def create_block(typestr, placement, item_shape=None, num_offset=0):
6868
elif typestr in ('object', 'string', 'O'):
6969
values = np.reshape(['A%d' % i for i in mat.ravel() + num_offset],
7070
shape)
71-
elif typestr in ('bool'):
71+
elif typestr in ('b','bool',):
7272
values = np.ones(shape, dtype=np.bool_)
7373
elif typestr in ('datetime', 'dt', 'M8[ns]'):
7474
values = (mat * 1e9).astype('M8[ns]')
7575
elif typestr in ('timedelta', 'td', 'm8[ns]'):
7676
values = (mat * 1).astype('m8[ns]')
77-
elif typestr in ('category'):
77+
elif typestr in ('category',):
7878
values = Categorical([1,1,2,2,3,3,3,3,4,4])
79-
elif typestr in ('category2'):
79+
elif typestr in ('category2',):
8080
values = Categorical(['a','a','a','a','b','b','c','c','c','d'])
8181
elif typestr in ('sparse', 'sparse_na'):
8282
# FIXME: doesn't support num_rows != 10
@@ -751,6 +751,25 @@ def test_equals(self):
751751
bm2 = BlockManager(bm1.blocks[::-1], bm1.axes)
752752
self.assertTrue(bm1.equals(bm2))
753753

754+
def test_equals_block_order_different_dtypes(self):
755+
# GH 9330
756+
757+
mgr_strings = [
758+
"a:i8;b:f8", # basic case
759+
"a:i8;b:f8;c:c8;d:b", # many types
760+
"a:i8;e:dt;f:td;g:string", # more types
761+
"a:i8;b:category;c:category2;d:category2", # categories
762+
"c:sparse;d:sparse_na;b:f8", # sparse
763+
]
764+
765+
for mgr_string in mgr_strings:
766+
bm = create_mgr(mgr_string)
767+
block_perms = itertools.permutations(bm.blocks)
768+
for bm_perm in block_perms:
769+
bm_this = BlockManager(bm_perm, bm.axes)
770+
self.assertTrue(bm.equals(bm_this))
771+
self.assertTrue(bm_this.equals(bm))
772+
754773
def test_single_mgr_ctor(self):
755774
mgr = create_single_mgr('f8', num_rows=5)
756775
self.assertEqual(mgr.as_matrix().tolist(), [0., 1., 2., 3., 4.])

0 commit comments

Comments
 (0)