From 18f25e4e65a91cacc95cdfe620f64377107ebe68 Mon Sep 17 00:00:00 2001 From: dsm054 Date: Sat, 28 Mar 2015 18:35:51 -0400 Subject: [PATCH] BUG: DataFrame.equals should not care about block order (GH #9330) --- doc/source/whatsnew/v0.16.1.txt | 1 + pandas/core/internals.py | 14 +++++++++++++- pandas/io/tests/test_pytables.py | 15 +++++++++++++++ pandas/tests/test_frame.py | 14 ++++++++++++++ pandas/tests/test_internals.py | 25 ++++++++++++++++++++++--- 5 files changed, 65 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt index 8c49e2780ed06..52b57529fc6c2 100644 --- a/doc/source/whatsnew/v0.16.1.txt +++ b/doc/source/whatsnew/v0.16.1.txt @@ -70,6 +70,7 @@ Bug Fixes - Bug in ``transform`` causing length mismatch when null entries were present and a fast aggregator was being used (:issue:`9697`) +- Bug in ``equals`` causing false negatives when block order differed (:issue:`9330`) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 7a16fb2b6b0d7..9b2d366bfb2be 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -3310,8 +3310,20 @@ def equals(self, other): return False self._consolidate_inplace() other._consolidate_inplace() + if len(self.blocks) != len(other.blocks): + return False + + # canonicalize block order, using a tuple combining the type + # name and then mgr_locs because there might be unconsolidated + # blocks (say, Categorical) which can only be distinguished by + # the iteration order + def canonicalize(block): + return (block.dtype.name, block.mgr_locs.as_array.tolist()) + + self_blocks = sorted(self.blocks, key=canonicalize) + other_blocks = sorted(other.blocks, key=canonicalize) return all(block.equals(oblock) for block, oblock in - zip(self.blocks, other.blocks)) + zip(self_blocks, other_blocks)) class SingleBlockManager(BlockManager): diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index a15149e341f4d..66b2f4d30f6b6 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -4586,9 +4586,24 @@ def test_duplicate_column_name(self): df.to_hdf(path, 'df', format='table') other = read_hdf(path, 'df') + tm.assert_frame_equal(df, other) + self.assertTrue(df.equals(other)) + self.assertTrue(other.equals(df)) + + def test_round_trip_equals(self): + # GH 9330 + df = DataFrame({"B": [1,2], "A": ["x","y"]}) + with ensure_clean_path(self.path) as path: + df.to_hdf(path, 'df', format='table') + other = read_hdf(path, 'df') + tm.assert_frame_equal(df, other) + self.assertTrue(df.equals(other)) + self.assertTrue(other.equals(df)) + + def _test_sort(obj): if isinstance(obj, DataFrame): return obj.reindex(sorted(obj.index)) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 1acad4cf978a8..0b365a3399e0b 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -5944,6 +5944,20 @@ def test_boolean_comparison(self): self.assertRaises(ValueError, lambda : df == (2,2)) self.assertRaises(ValueError, lambda : df == [2,2]) + def test_equals_different_blocks(self): + # GH 9330 + df0 = pd.DataFrame({"A": ["x","y"], "B": [1,2], + "C": ["w","z"]}) + df1 = df0.reset_index()[["A","B","C"]] + # this assert verifies that the above operations have + # induced a block rearrangement + self.assertTrue(df0._data.blocks[0].dtype != + df1._data.blocks[0].dtype) + # do the real tests + self.assert_frame_equal(df0, df1) + self.assertTrue(df0.equals(df1)) + self.assertTrue(df1.equals(df0)) + def test_to_csv_from_csv(self): pname = '__tmp_to_csv_from_csv__' diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py index 45f089f5e0a53..36585abd1b98f 100644 --- a/pandas/tests/test_internals.py +++ b/pandas/tests/test_internals.py @@ -68,15 +68,15 @@ def create_block(typestr, placement, item_shape=None, num_offset=0): elif typestr in ('object', 'string', 'O'): values = np.reshape(['A%d' % i for i in mat.ravel() + num_offset], shape) - elif typestr in ('bool'): + elif typestr in ('b','bool',): values = np.ones(shape, dtype=np.bool_) elif typestr in ('datetime', 'dt', 'M8[ns]'): values = (mat * 1e9).astype('M8[ns]') elif typestr in ('timedelta', 'td', 'm8[ns]'): values = (mat * 1).astype('m8[ns]') - elif typestr in ('category'): + elif typestr in ('category',): values = Categorical([1,1,2,2,3,3,3,3,4,4]) - elif typestr in ('category2'): + elif typestr in ('category2',): values = Categorical(['a','a','a','a','b','b','c','c','c','d']) elif typestr in ('sparse', 'sparse_na'): # FIXME: doesn't support num_rows != 10 @@ -751,6 +751,25 @@ def test_equals(self): bm2 = BlockManager(bm1.blocks[::-1], bm1.axes) self.assertTrue(bm1.equals(bm2)) + def test_equals_block_order_different_dtypes(self): + # GH 9330 + + mgr_strings = [ + "a:i8;b:f8", # basic case + "a:i8;b:f8;c:c8;d:b", # many types + "a:i8;e:dt;f:td;g:string", # more types + "a:i8;b:category;c:category2;d:category2", # categories + "c:sparse;d:sparse_na;b:f8", # sparse + ] + + for mgr_string in mgr_strings: + bm = create_mgr(mgr_string) + block_perms = itertools.permutations(bm.blocks) + for bm_perm in block_perms: + bm_this = BlockManager(bm_perm, bm.axes) + self.assertTrue(bm.equals(bm_this)) + self.assertTrue(bm_this.equals(bm)) + def test_single_mgr_ctor(self): mgr = create_single_mgr('f8', num_rows=5) self.assertEqual(mgr.as_matrix().tolist(), [0., 1., 2., 3., 4.])