From 18f25e4e65a91cacc95cdfe620f64377107ebe68 Mon Sep 17 00:00:00 2001
From: dsm054 <dsm054@gmail.com>
Date: Sat, 28 Mar 2015 18:35:51 -0400
Subject: [PATCH] BUG: DataFrame.equals should not care about block order (GH
 #9330)

---
 doc/source/whatsnew/v0.16.1.txt  |  1 +
 pandas/core/internals.py         | 14 +++++++++++++-
 pandas/io/tests/test_pytables.py | 15 +++++++++++++++
 pandas/tests/test_frame.py       | 14 ++++++++++++++
 pandas/tests/test_internals.py   | 25 ++++++++++++++++++++++---
 5 files changed, 65 insertions(+), 4 deletions(-)

diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt
index 8c49e2780ed06..52b57529fc6c2 100644
--- a/doc/source/whatsnew/v0.16.1.txt
+++ b/doc/source/whatsnew/v0.16.1.txt
@@ -70,6 +70,7 @@ Bug Fixes
 
 - Bug in ``transform`` causing length mismatch when null entries were present and a fast aggregator was being used (:issue:`9697`)
 
+- Bug in ``DataFrame.equals`` returning ``False`` for frames with identical contents whose internal block order differed (:issue:`9330`)
 
 
 
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index 7a16fb2b6b0d7..9b2d366bfb2be 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -3310,8 +3310,20 @@ def equals(self, other):
             return False
         self._consolidate_inplace()
         other._consolidate_inplace()
+        if len(self.blocks) != len(other.blocks):
+            return False
+
+        # canonicalize block order by sorting on (dtype name, mgr_locs):
+        # the dtype name alone is not a unique key, because blocks that
+        # cannot be consolidated (say, Categorical) may share a dtype and
+        # can then only be told apart by their mgr_locs
+        def canonicalize(block):
+            return (block.dtype.name, block.mgr_locs.as_array.tolist())
+
+        self_blocks = sorted(self.blocks, key=canonicalize)
+        other_blocks = sorted(other.blocks, key=canonicalize)
         return all(block.equals(oblock) for block, oblock in
-                   zip(self.blocks, other.blocks))
+                   zip(self_blocks, other_blocks))
 
 
 class SingleBlockManager(BlockManager):
diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py
index a15149e341f4d..66b2f4d30f6b6 100644
--- a/pandas/io/tests/test_pytables.py
+++ b/pandas/io/tests/test_pytables.py
@@ -4586,9 +4586,24 @@ def test_duplicate_column_name(self):
 
             df.to_hdf(path, 'df', format='table')
             other = read_hdf(path, 'df')
+
             tm.assert_frame_equal(df, other)
+            self.assertTrue(df.equals(other))
+            self.assertTrue(other.equals(df))
+
+    def test_round_trip_equals(self):
+        # GH 9330: a frame read back from HDF must compare equal via equals()
+        original = DataFrame({"B": [1, 2], "A": ["x", "y"]})
 
+        with ensure_clean_path(self.path) as path:
+            original.to_hdf(path, 'df', format='table')
+            restored = read_hdf(path, 'df')
+            tm.assert_frame_equal(original, restored)
+            self.assertTrue(original.equals(restored))
+            self.assertTrue(restored.equals(original))
 
+
+
 def _test_sort(obj):
     if isinstance(obj, DataFrame):
         return obj.reindex(sorted(obj.index))
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
index 1acad4cf978a8..0b365a3399e0b 100644
--- a/pandas/tests/test_frame.py
+++ b/pandas/tests/test_frame.py
@@ -5944,6 +5944,20 @@ def test_boolean_comparison(self):
         self.assertRaises(ValueError, lambda : df == (2,2))
         self.assertRaises(ValueError, lambda : df == [2,2])
 
+    def test_equals_different_blocks(self):
+        # GH 9330: equals must not depend on the internal block order
+        df0 = pd.DataFrame({"A": ["x", "y"], "B": [1, 2],
+                            "C": ["w", "z"]})
+        df1 = df0.reset_index()[["A", "B", "C"]]
+        # sanity check: the reset_index/column selection above must have
+        # induced a block rearrangement, or the test exercises nothing
+        self.assertNotEqual(df0._data.blocks[0].dtype,
+                            df1._data.blocks[0].dtype)
+        # do the real tests
+        self.assert_frame_equal(df0, df1)
+        self.assertTrue(df0.equals(df1))
+        self.assertTrue(df1.equals(df0))
+
     def test_to_csv_from_csv(self):
 
         pname = '__tmp_to_csv_from_csv__'
diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py
index 45f089f5e0a53..36585abd1b98f 100644
--- a/pandas/tests/test_internals.py
+++ b/pandas/tests/test_internals.py
@@ -68,15 +68,15 @@ def create_block(typestr, placement, item_shape=None, num_offset=0):
     elif typestr in ('object', 'string', 'O'):
         values = np.reshape(['A%d' % i for i in mat.ravel() + num_offset],
                             shape)
-    elif typestr in ('bool'):
+    elif typestr in ('b','bool',):
         values = np.ones(shape, dtype=np.bool_)
     elif typestr in ('datetime', 'dt', 'M8[ns]'):
         values = (mat * 1e9).astype('M8[ns]')
     elif typestr in ('timedelta', 'td', 'm8[ns]'):
         values = (mat * 1).astype('m8[ns]')
-    elif typestr in ('category'):
+    elif typestr in ('category',):
         values = Categorical([1,1,2,2,3,3,3,3,4,4])
-    elif typestr in ('category2'):
+    elif typestr in ('category2',):
         values = Categorical(['a','a','a','a','b','b','c','c','c','d'])
     elif typestr in ('sparse', 'sparse_na'):
         # FIXME: doesn't support num_rows != 10
@@ -751,6 +751,25 @@ def test_equals(self):
         bm2 = BlockManager(bm1.blocks[::-1], bm1.axes)
         self.assertTrue(bm1.equals(bm2))
 
+    def test_equals_block_order_different_dtypes(self):
+        # GH 9330: equals must hold for every permutation of the blocks
+
+        mgr_strings = [
+            "a:i8;b:f8",  # basic case
+            "a:i8;b:f8;c:c8;d:b",  # many types
+            "a:i8;e:dt;f:td;g:string",  # more types
+            "a:i8;b:category;c:category2;d:category2",  # categories
+            "c:sparse;d:sparse_na;b:f8",  # sparse
+        ]
+
+        for mgr_string in mgr_strings:
+            bm = create_mgr(mgr_string)
+            block_perms = itertools.permutations(bm.blocks)
+            for bm_perm in block_perms:
+                bm_this = BlockManager(bm_perm, bm.axes)
+                self.assertTrue(bm.equals(bm_this))
+                self.assertTrue(bm_this.equals(bm))
+
     def test_single_mgr_ctor(self):
         mgr = create_single_mgr('f8', num_rows=5)
         self.assertEqual(mgr.as_matrix().tolist(), [0., 1., 2., 3., 4.])