Skip to content

Commit 18f25e4

Browse files
committed
BUG: DataFrame.equals should not care about block order (GH pandas-dev#9330)
1 parent: 7dfb279; commit: 18f25e4

File tree

5 files changed: +65 / -4 lines changed

doc/source/whatsnew/v0.16.1.txt

+1
Original file line number | Diff line number | Diff line change
@@ -70,6 +70,7 @@ Bug Fixes
7070

7171
- Bug in ``transform`` causing length mismatch when null entries were present and a fast aggregator was being used (:issue:`9697`)
7272

73+
- Bug in ``equals`` causing false negatives when block order differed (:issue:`9330`)
7374

7475

7576

pandas/core/internals.py

+13 / -1
Original file line number | Diff line number | Diff line change
@@ -3310,8 +3310,20 @@ def equals(self, other):
33103310
return False
33113311
self._consolidate_inplace()
33123312
other._consolidate_inplace()
3313+
if len(self.blocks) != len(other.blocks):
3314+
return False
3315+
3316+
# canonicalize block order, using a tuple combining the type
3317+
# name and then mgr_locs because there might be unconsolidated
3318+
# blocks (say, Categorical) which can only be distinguished by
3319+
# the iteration order
3320+
def canonicalize(block):
3321+
return (block.dtype.name, block.mgr_locs.as_array.tolist())
3322+
3323+
self_blocks = sorted(self.blocks, key=canonicalize)
3324+
other_blocks = sorted(other.blocks, key=canonicalize)
33133325
return all(block.equals(oblock) for block, oblock in
3314-
zip(self.blocks, other.blocks))
3326+
zip(self_blocks, other_blocks))
33153327

33163328

33173329
class SingleBlockManager(BlockManager):

pandas/io/tests/test_pytables.py

+15
Original file line number | Diff line number | Diff line change
@@ -4586,9 +4586,24 @@ def test_duplicate_column_name(self):
45864586

45874587
df.to_hdf(path, 'df', format='table')
45884588
other = read_hdf(path, 'df')
4589+
45894590
tm.assert_frame_equal(df, other)
4591+
self.assertTrue(df.equals(other))
4592+
self.assertTrue(other.equals(df))
4593+
4594+
def test_round_trip_equals(self):
4595+
# GH 9330
4596+
df = DataFrame({"B": [1,2], "A": ["x","y"]})
45904597

4598+
with ensure_clean_path(self.path) as path:
4599+
df.to_hdf(path, 'df', format='table')
4600+
other = read_hdf(path, 'df')
4601+
tm.assert_frame_equal(df, other)
4602+
self.assertTrue(df.equals(other))
4603+
self.assertTrue(other.equals(df))
45914604

4605+
4606+
45924607
def _test_sort(obj):
45934608
if isinstance(obj, DataFrame):
45944609
return obj.reindex(sorted(obj.index))

pandas/tests/test_frame.py

+14
Original file line number | Diff line number | Diff line change
@@ -5944,6 +5944,20 @@ def test_boolean_comparison(self):
59445944
self.assertRaises(ValueError, lambda : df == (2,2))
59455945
self.assertRaises(ValueError, lambda : df == [2,2])
59465946

5947+
def test_equals_different_blocks(self):
5948+
# GH 9330
5949+
df0 = pd.DataFrame({"A": ["x","y"], "B": [1,2],
5950+
"C": ["w","z"]})
5951+
df1 = df0.reset_index()[["A","B","C"]]
5952+
# this assert verifies that the above operations have
5953+
# induced a block rearrangement
5954+
self.assertTrue(df0._data.blocks[0].dtype !=
5955+
df1._data.blocks[0].dtype)
5956+
# do the real tests
5957+
self.assert_frame_equal(df0, df1)
5958+
self.assertTrue(df0.equals(df1))
5959+
self.assertTrue(df1.equals(df0))
5960+
59475961
def test_to_csv_from_csv(self):
59485962

59495963
pname = '__tmp_to_csv_from_csv__'

pandas/tests/test_internals.py

+22 / -3
Original file line number | Diff line number | Diff line change
@@ -68,15 +68,15 @@ def create_block(typestr, placement, item_shape=None, num_offset=0):
6868
elif typestr in ('object', 'string', 'O'):
6969
values = np.reshape(['A%d' % i for i in mat.ravel() + num_offset],
7070
shape)
71-
elif typestr in ('bool'):
71+
elif typestr in ('b','bool',):
7272
values = np.ones(shape, dtype=np.bool_)
7373
elif typestr in ('datetime', 'dt', 'M8[ns]'):
7474
values = (mat * 1e9).astype('M8[ns]')
7575
elif typestr in ('timedelta', 'td', 'm8[ns]'):
7676
values = (mat * 1).astype('m8[ns]')
77-
elif typestr in ('category'):
77+
elif typestr in ('category',):
7878
values = Categorical([1,1,2,2,3,3,3,3,4,4])
79-
elif typestr in ('category2'):
79+
elif typestr in ('category2',):
8080
values = Categorical(['a','a','a','a','b','b','c','c','c','d'])
8181
elif typestr in ('sparse', 'sparse_na'):
8282
# FIXME: doesn't support num_rows != 10
@@ -751,6 +751,25 @@ def test_equals(self):
751751
bm2 = BlockManager(bm1.blocks[::-1], bm1.axes)
752752
self.assertTrue(bm1.equals(bm2))
753753

754+
def test_equals_block_order_different_dtypes(self):
755+
# GH 9330
756+
757+
mgr_strings = [
758+
"a:i8;b:f8", # basic case
759+
"a:i8;b:f8;c:c8;d:b", # many types
760+
"a:i8;e:dt;f:td;g:string", # more types
761+
"a:i8;b:category;c:category2;d:category2", # categories
762+
"c:sparse;d:sparse_na;b:f8", # sparse
763+
]
764+
765+
for mgr_string in mgr_strings:
766+
bm = create_mgr(mgr_string)
767+
block_perms = itertools.permutations(bm.blocks)
768+
for bm_perm in block_perms:
769+
bm_this = BlockManager(bm_perm, bm.axes)
770+
self.assertTrue(bm.equals(bm_this))
771+
self.assertTrue(bm_this.equals(bm))
772+
754773
def test_single_mgr_ctor(self):
755774
mgr = create_single_mgr('f8', num_rows=5)
756775
self.assertEqual(mgr.as_matrix().tolist(), [0., 1., 2., 3., 4.])

0 commit comments

Comments
 (0)