API: Add equals method to NDFrames. (Implemented with array_equivalent, which is similar to np.array_equal except that it handles object arrays and treats NaNs in corresponding locations as equal.)

unutbu · unutbu · commit 9f26fbc915e2 · 2014-01-24T16:00:21.000-05:00
TST: Add tests for NDFrame.equals and BlockManager.equals

DOC: Mention the equals method in basics, release and v.0.13.1
diff --git a/doc/source/basics.rst b/doc/source/basics.rst
@@ -215,6 +215,14 @@ These operations produce a pandas object the same type as the left-hand-side inp
 that if of dtype ``bool``. These ``boolean`` objects can be used in indexing operations,
 see :ref:`here<indexing.boolean>`
 
+As of v0.13.1, Series, DataFrames and Panels have an ``equals`` method that tests
+whether two such objects are equal, treating NaNs in the same location as equal.
+
+.. ipython:: python
+
+   df.equals(df)
+   df.equals(df2)
+
 .. _basics.reductions:
 
 Boolean Reductions
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -66,6 +66,7 @@ API Changes
     timedeltas (:issue:`5458`,:issue:`5689`)
   - Add ``-NaN`` and ``-nan`` to the default set of NA values
     (:issue:`5952`).  See :ref:`NA Values <io.na_values>`.
+  - ``NDFrame`` now has an ``equals`` method. (:issue:`5283`) 
 
 Experimental Features
 ~~~~~~~~~~~~~~~~~~~~~
diff --git a/doc/source/v0.13.1.txt b/doc/source/v0.13.1.txt
@@ -42,6 +42,21 @@ API changes
 
 - Add ``-NaN`` and ``-nan`` to the default set of NA values (:issue:`5952`).
   See :ref:`NA Values <io.na_values>`.
+- Added the ``NDFrame.equals()`` method to test whether two NDFrames
+  have equal axes, dtypes, and values. Added the
+  ``array_equivalent`` function to compare if two ndarrays are
+  equal. NaNs in identical locations are treated as
+  equal. (:issue:`5283`)
+
+  .. ipython:: python
+
+      df = DataFrame({'col':['foo', 0, np.nan]}).sort()
+      df2 = DataFrame({'col':[np.nan, 0, 'foo']}, index=[2,1,0])
+      df.equals(df)
+
+      import pandas.core.common as com
+      com.array_equivalent(np.array([0, np.nan]), np.array([0, np.nan]))
+      np.array_equal(np.array([0, np.nan]), np.array([0, np.nan]))
 
 Prior Version Deprecations/Changes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/pandas/core/common.py b/pandas/core/common.py
@@ -277,6 +277,42 @@ def notnull(obj):
     return -res
 
 
+def array_equivalent(left, right):
+    """
+    True if two arrays, left and right, have equal non-NaN elements, and NaNs in
+    corresponding locations.  False otherwise. It is assumed that left and right
+    are NumPy arrays of the same dtype. The behavior of this function
+    (particularly with respect to NaNs) is not defined if the dtypes are
+    different.
+
+    Parameters
+    ----------
+    left, right : array_like
+        Input arrays.
+
+    Returns
+    -------
+    b : bool
+        Returns True if the arrays are equivalent.
+
+    Examples
+    --------
+    >>> array_equivalent([1, 2, nan], np.array([1, 2, nan]))
+    True
+    >>> array_equivalent([1, nan, 2], [1, 2, nan])
+    False
+    """
+    if left.shape != right.shape: return False
+    # NaNs occur only in object arrays, float or complex arrays.
+    if left.dtype == np.object_:
+        # If object array, we need to use pd.isnull
+        return ((left == right) | pd.isnull(left) & pd.isnull(right)).all()
+    elif not issubclass(left.dtype.type, (np.floating, np.complexfloating)):
+        # if not a float or complex array, then there are no NaNs
+        return np.array_equal(left, right)
+    # For float or complex arrays, using np.isnan is faster than pd.isnull
+    return  ((left == right) | (np.isnan(left) & np.isnan(right))).all()
+
 def _iterable_not_string(x):
     return (isinstance(x, collections.Iterable) and
             not isinstance(x, compat.string_types))
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -608,6 +608,15 @@ def __invert__(self):
         arr = operator.inv(_values_from_object(self))
         return self._wrap_array(arr, self.axes, copy=False)
 
+    def equals(self, other):
+        """
+        Determines if two NDFrame objects contain the same elements. NaNs in the
+        same location are considered equal.
+        """
+        if not isinstance(other, self._constructor):
+            return False
+        return self._data.equals(other._data)
+            
     #----------------------------------------------------------------------
     # Iteration
 
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
@@ -1121,13 +1121,24 @@ def func(c, v, o):
 
         return result_blocks
 
+    def equals(self, other):
+        if self.dtype != other.dtype or self.shape != other.shape: return False
+        return np.array_equal(self.values, other.values)
+
 
 class NumericBlock(Block):
     is_numeric = True
     _can_hold_na = True
 
 
-class FloatBlock(NumericBlock):
+class FloatOrComplexBlock(NumericBlock):
+    def equals(self, other):
+        if self.dtype != other.dtype or self.shape != other.shape: return False
+        left, right = self.values, other.values
+        return ((left == right) | (np.isnan(left) & np.isnan(right))).all()
+
+
+class FloatBlock(FloatOrComplexBlock):
     is_float = True
     _downcast_dtype = 'int64'
 
@@ -1166,8 +1177,7 @@ def should_store(self, value):
         return (issubclass(value.dtype.type, np.floating) and
                 value.dtype == self.dtype)
 
-
-class ComplexBlock(NumericBlock):
+class ComplexBlock(FloatOrComplexBlock):
     is_complex = True
 
     def _can_hold_element(self, element):
@@ -2563,7 +2573,7 @@ def get_data(self, copy=False, columns=None, **kwargs):
         return self.combine(blocks)
 
     def combine(self, blocks):
-        """ reutrn a new manager with the blocks """
+        """ return a new manager with the blocks """
         indexer = np.sort(np.concatenate([b.ref_locs for b in blocks]))
         new_items = self.items.take(indexer)
 
@@ -3491,6 +3501,16 @@ def item_dtypes(self):
             raise AssertionError('Some items were not in any block')
         return result
 
+    def equals(self, other):
+        self_axes, other_axes = self.axes, other.axes
+        if len(self_axes) != len(other_axes):
+           return False
+        if not all (ax1.equals(ax2) for ax1, ax2 in zip(self_axes, other_axes)):
+            return False
+        self._consolidate_inplace()
+        other._consolidate_inplace()
+        return all(block.equals(oblock) for block, oblock in
+                   zip(self.blocks, other.blocks))
 
 class SingleBlockManager(BlockManager):
 
@@ -4004,6 +4024,9 @@ def _merge_blocks(blocks, items, dtype=None, _can_consolidate=True):
                 raise AssertionError("_merge_blocks are invalid!")
             dtype = blocks[0].dtype
 
+        if not items.is_unique:
+            blocks = sorted(blocks, key=lambda b: b.ref_locs.tolist())
+
         new_values = _vstack([b.values for b in blocks], dtype)
         new_items = blocks[0].items.append([b.items for b in blocks[1:]])
         new_block = make_block(new_values, new_items, items)
diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py
@@ -5,11 +5,10 @@
 from nose.tools import assert_equal
 import numpy as np
 from pandas.tslib import iNaT, NaT
-
 from pandas import Series, DataFrame, date_range, DatetimeIndex, Timestamp
 from pandas import compat
 from pandas.compat import range, long, lrange, lmap, u
-from pandas.core.common import notnull, isnull
+from pandas.core.common import notnull, isnull, array_equivalent
 import pandas.core.common as com
 import pandas.util.testing as tm
 import pandas.core.config as cf
@@ -167,6 +166,19 @@ def test_downcast_conv():
         result = com._possibly_downcast_to_dtype(arr,'infer')
         tm.assert_almost_equal(result, expected)
 
+
+def test_array_equivalent():
+    assert array_equivalent(np.array([np.nan, np.nan]), np.array([np.nan, np.nan]))
+    assert array_equivalent(np.array([np.nan, 1, np.nan]), np.array([np.nan, 1, np.nan]))
+    assert array_equivalent(np.array([np.nan, None], dtype='object'),
+                            np.array([np.nan, None], dtype='object'))
+    assert array_equivalent(np.array([np.nan, 1+1j], dtype='complex'),
+                            np.array([np.nan, 1+1j], dtype='complex'))
+    assert not array_equivalent(np.array([np.nan, 1+1j], dtype='complex'),
+                                np.array([np.nan, 1+2j], dtype='complex'))
+    assert not array_equivalent(np.array([np.nan, 1, np.nan]), np.array([np.nan, 2, np.nan]))
+    assert not array_equivalent(np.array(['a', 'b', 'c', 'd']), np.array(['e', 'e']))
+
 def test_datetimeindex_from_empty_datetime64_array():
     for unit in [ 'ms', 'us', 'ns' ]:
         idx = DatetimeIndex(np.array([], dtype='datetime64[%s]' % unit))
diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py
@@ -812,7 +812,8 @@ def test_metadata_propagation_indiv(self):
         self.check_metadata(df,result)
 
         # resample
-        df = DataFrame(np.random.randn(1000,2), index=date_range('20130101',periods=1000,freq='s'))
+        df = DataFrame(np.random.randn(1000,2),
+                       index=date_range('20130101',periods=1000,freq='s'))
         result = df.resample('1T')
         self.check_metadata(df,result)
 
@@ -851,6 +852,80 @@ def test_squeeze(self):
         p4d = tm.makePanel4D().reindex(labels=['label1'],items=['ItemA'])
         tm.assert_frame_equal(p4d.squeeze(),p4d.ix['label1','ItemA'])
 
+    def test_equals(self):
+        s1 = pd.Series([1, 2, 3], index=[0, 2, 1])
+        s2 = s1.copy()
+        self.assert_(s1.equals(s2))
+
+        s1[1] = 99
+        self.assert_(not s1.equals(s2))
+
+        # NaNs compare as equal
+        s1 = pd.Series([1, np.nan, 3, np.nan], index=[0, 2, 1, 3])
+        s2 = s1.copy()
+        self.assert_(s1.equals(s2))
+
+        s2[0] = 9.9
+        self.assert_(not s1.equals(s2))
+        
+        idx = MultiIndex.from_tuples([(0, 'a'), (1, 'b'), (2, 'c')])
+        s1 = Series([1, 2, np.nan], index=idx)
+        s2 = s1.copy()
+        self.assert_(s1.equals(s2))
+
+        # Add object dtype column with nans
+        index = np.random.random(10)
+        df1 = DataFrame(np.random.random(10,), index=index, columns=['floats'])
+        df1['text'] = 'the sky is so blue. we could use more chocolate.'.split()
+        df1['start'] = date_range('2000-1-1', periods=10, freq='T')
+        df1['end'] = date_range('2000-1-1', periods=10, freq='D')
+        df1['diff'] = df1['end'] - df1['start']
+        df1['bool'] = (np.arange(10) % 3 == 0)
+        df1.ix[::2] = nan
+        df2 = df1.copy()
+        self.assert_(df1['text'].equals(df2['text']))
+        self.assert_(df1['start'].equals(df2['start']))
+        self.assert_(df1['end'].equals(df2['end']))
+        self.assert_(df1['diff'].equals(df2['diff']))
+        self.assert_(df1['bool'].equals(df2['bool']))
+        self.assert_(df1.equals(df2))
+        self.assert_(not df1.equals(object))
+
+        # different dtype
+        different = df1.copy()
+        different['floats'] = different['floats'].astype('float32')
+        self.assert_(not df1.equals(different)) 
+
+        # different index
+        different_index = -index
+        different = df2.set_index(different_index)
+        self.assert_(not df1.equals(different))        
+
+        # different columns
+        different = df2.copy()
+        different.columns = df2.columns[::-1]
+        self.assert_(not df1.equals(different))        
+
+        # DatetimeIndex
+        index = pd.date_range('2000-1-1', periods=10, freq='T')
+        df1 = df1.set_index(index)
+        df2 = df1.copy()
+        self.assert_(df1.equals(df2))
+
+        # MultiIndex
+        df3 = df1.set_index(['text'], append=True)
+        df2 = df1.set_index(['text'], append=True)
+        self.assert_(df3.equals(df2))
+
+        df2 = df1.set_index(['floats'], append=True)
+        self.assert_(not df3.equals(df2))
+
+        # NaN in index
+        df3 = df1.set_index(['floats'], append=True)
+        df2 = df1.set_index(['floats'], append=True)
+        self.assert_(df3.equals(df2))
+
+
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                    exit=False)
diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py
@@ -584,6 +584,27 @@ def test_missing_unicode_key(self):
         except KeyError:
             pass  # this is the expected exception
 
+    def test_equals(self):
+        # unique items
+        index = Index(list('abcdef'))
+        block1 = make_block(np.arange(12).reshape(3,4), list('abc'), index)
+        block2 = make_block(np.arange(12).reshape(3,4)*10, list('def'), index)
+        block1.ref_items = block2.ref_items = index
+        bm1 = BlockManager([block1, block2], [index, np.arange(block1.shape[1])])
+        bm2 = BlockManager([block2, block1], [index, np.arange(block1.shape[1])])
+        self.assert_(bm1.equals(bm2))
+
+        # non-unique items
+        index = Index(list('aaabbb'))
+        block1 = make_block(np.arange(12).reshape(3,4), list('aaa'), index,
+                            placement=[0,1,2])
+        block2 = make_block(np.arange(12).reshape(3,4)*10, list('bbb'), index,
+                            placement=[3,4,5])
+        block1.ref_items = block2.ref_items = index
+        bm1 = BlockManager([block1, block2], [index, np.arange(block1.shape[1])])
+        bm2 = BlockManager([block2, block1], [index, np.arange(block1.shape[1])])
+        self.assert_(bm1.equals(bm2))
+        
 if __name__ == '__main__':
     import nose
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
@@ -436,6 +436,7 @@ def isiterable(obj):
 def is_sorted(seq):
     return assert_almost_equal(seq, np.sort(np.array(seq)))
 
+# This could be refactored to use the NDFrame.equals method
 def assert_series_equal(left, right, check_dtype=True,
                         check_index_type=False,
                         check_series_type=False,
@@ -455,7 +456,7 @@ def assert_series_equal(left, right, check_dtype=True,
         assert_attr_equal('dtype', left.index, right.index)
         assert_attr_equal('inferred_type', left.index, right.index)
 
-
+# This could be refactored to use the NDFrame.equals method
 def assert_frame_equal(left, right, check_dtype=True,
                        check_index_type=False,
                        check_column_type=False,
diff --git a/vb_suite/frame_methods.py b/vb_suite/frame_methods.py
@@ -370,3 +370,36 @@ def f(K=100):
 frame_dtypes = Benchmark('df.dtypes', setup,
                          start_date=datetime(2012,1,1))
 
+#----------------------------------------------------------------------
+# equals
+setup = common_setup + """
+def make_pair(name):
+    df = globals()[name]
+    df2 = df.copy()
+    df2.ix[-1,-1] = np.nan
+    return df, df2
+
+def test_equal(name):
+    df, df2 = pairs[name]
+    return df.equals(df)
+
+def test_unequal(name):
+    df, df2 = pairs[name]
+    return df.equals(df2)
+    
+float_df = DataFrame(np.random.randn(1000, 1000))
+object_df = DataFrame([['foo']*1000]*1000)
+nonunique_cols = object_df.copy()
+nonunique_cols.columns = ['A']*len(nonunique_cols.columns)
+
+pairs = dict([(name,make_pair(name))
+         for name in ('float_df', 'object_df', 'nonunique_cols')])
+"""
+frame_float_equal = Benchmark('test_equal("float_df")', setup)
+frame_object_equal = Benchmark('test_equal("object_df")', setup)
+frame_nonunique_equal = Benchmark('test_equal("nonunique_cols")', setup)
+
+frame_float_unequal = Benchmark('test_unequal("float_df")', setup)
+frame_object_unequal = Benchmark('test_unequal("object_df")', setup)
+frame_nonunique_unequal = Benchmark('test_unequal("nonunique_cols")', setup)
+