diff --git a/doc/source/basics.rst b/doc/source/basics.rst index bd2980c2f1c9f..e9cc03c098d03 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -215,6 +215,14 @@ These operations produce a pandas object the same type as the left-hand-side inp that if of dtype ``bool``. These ``boolean`` objects can be used in indexing operations, see :ref:`here` +As of v0.13.1, Series, DataFrames and Panels have an ``equals`` method to test whether +two such objects are equal. + +.. ipython:: python + + df.equals(df) + df.equals(df2) + .. _basics.reductions: Boolean Reductions diff --git a/doc/source/release.rst b/doc/source/release.rst index a69c0f8acaa46..78b94f14daf54 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -66,6 +66,7 @@ API Changes timedeltas (:issue:`5458`,:issue:`5689`) - Add ``-NaN`` and ``-nan`` to the default set of NA values (:issue:`5952`). See :ref:`NA Values `. + - ``NDFrame`` now has an ``equals`` method. (:issue:`5283`) Experimental Features ~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/v0.13.1.txt b/doc/source/v0.13.1.txt index 3efe79ce281db..1740d21a08466 100644 --- a/doc/source/v0.13.1.txt +++ b/doc/source/v0.13.1.txt @@ -42,6 +42,21 @@ API changes - Add ``-NaN`` and ``-nan`` to the default set of NA values (:issue:`5952`). See :ref:`NA Values `. +- Added the ``NDFrame.equals()`` method to compare whether two NDFrames + have equal axes, dtypes, and values. Added the + ``array_equivalent`` function to compare whether two ndarrays are + equal. NaNs in identical locations are treated as + equal. (:issue:`5283`) + + .. 
ipython:: python + + df = DataFrame({'col':['foo', 0, np.nan]}).sort() + df2 = DataFrame({'col':[np.nan, 0, 'foo']}, index=[2,1,0]) + df.equals(df) + + import pandas.core.common as com + com.array_equivalent(np.array([0, np.nan]), np.array([0, np.nan])) + np.array_equal(np.array([0, np.nan]), np.array([0, np.nan])) Prior Version Deprecations/Changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/pandas/core/common.py b/pandas/core/common.py index 5b585c44ca3b8..c3c038fa0945c 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -277,6 +277,42 @@ def notnull(obj): return -res +def array_equivalent(left, right): + """ + True if two arrays, left and right, have equal non-NaN elements, and NaNs in + corresponding locations. False otherwise. It is assumed that left and right + are NumPy arrays of the same dtype. The behavior of this function + (particularly with respect to NaNs) is not defined if the dtypes are + different. + + Parameters + ---------- + left, right : array_like + Input arrays. + + Returns + ------- + b : bool + Returns True if the arrays are equivalent. + + Examples + -------- + >>> array_equivalent([1, 2, nan], np.array([1, 2, nan])) + True + >>> array_equivalent([1, nan, 2], [1, 2, nan]) + False + """ + if left.shape != right.shape: return False + # NaNs occur only in object arrays, float or complex arrays. 
+ if left.dtype == np.object_: + # If object array, we need to use pd.isnull + return ((left == right) | pd.isnull(left) & pd.isnull(right)).all() + elif not issubclass(left.dtype.type, (np.floating, np.complexfloating)): + # if not a float or complex array, then there are no NaNs + return np.array_equal(left, right) + # For float or complex arrays, using np.isnan is faster than pd.isnull + return ((left == right) | (np.isnan(left) & np.isnan(right))).all() + def _iterable_not_string(x): return (isinstance(x, collections.Iterable) and not isinstance(x, compat.string_types)) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index bd53e1a35e166..2c03d16fc5cbe 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -608,6 +608,15 @@ def __invert__(self): arr = operator.inv(_values_from_object(self)) return self._wrap_array(arr, self.axes, copy=False) + def equals(self, other): + """ + Determines if two NDFrame objects contain the same elements. NaNs in the + same location are considered equal. 
+ """ + if not isinstance(other, self._constructor): + return False + return self._data.equals(other._data) + #---------------------------------------------------------------------- # Iteration diff --git a/pandas/core/internals.py b/pandas/core/internals.py index dacab4fd6e6c6..bd85391abe8a7 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1121,13 +1121,24 @@ def func(c, v, o): return result_blocks + def equals(self, other): + if self.dtype != other.dtype or self.shape != other.shape: return False + return np.array_equal(self.values, other.values) + class NumericBlock(Block): is_numeric = True _can_hold_na = True -class FloatBlock(NumericBlock): +class FloatOrComplexBlock(NumericBlock): + def equals(self, other): + if self.dtype != other.dtype or self.shape != other.shape: return False + left, right = self.values, other.values + return ((left == right) | (np.isnan(left) & np.isnan(right))).all() + + +class FloatBlock(FloatOrComplexBlock): is_float = True _downcast_dtype = 'int64' @@ -1166,8 +1177,7 @@ def should_store(self, value): return (issubclass(value.dtype.type, np.floating) and value.dtype == self.dtype) - -class ComplexBlock(NumericBlock): +class ComplexBlock(FloatOrComplexBlock): is_complex = True def _can_hold_element(self, element): @@ -2563,7 +2573,7 @@ def get_data(self, copy=False, columns=None, **kwargs): return self.combine(blocks) def combine(self, blocks): - """ reutrn a new manager with the blocks """ + """ return a new manager with the blocks """ indexer = np.sort(np.concatenate([b.ref_locs for b in blocks])) new_items = self.items.take(indexer) @@ -3491,6 +3501,16 @@ def item_dtypes(self): raise AssertionError('Some items were not in any block') return result + def equals(self, other): + self_axes, other_axes = self.axes, other.axes + if len(self_axes) != len(other_axes): + return False + if not all (ax1.equals(ax2) for ax1, ax2 in zip(self_axes, other_axes)): + return False + self._consolidate_inplace() + 
other._consolidate_inplace() + return all(block.equals(oblock) for block, oblock in + zip(self.blocks, other.blocks)) class SingleBlockManager(BlockManager): @@ -4004,6 +4024,9 @@ def _merge_blocks(blocks, items, dtype=None, _can_consolidate=True): raise AssertionError("_merge_blocks are invalid!") dtype = blocks[0].dtype + if not items.is_unique: + blocks = sorted(blocks, key=lambda b: b.ref_locs.tolist()) + new_values = _vstack([b.values for b in blocks], dtype) new_items = blocks[0].items.append([b.items for b in blocks[1:]]) new_block = make_block(new_values, new_items, items) diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index cdc188e52935b..45c9bf5d8c374 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -5,11 +5,10 @@ from nose.tools import assert_equal import numpy as np from pandas.tslib import iNaT, NaT - from pandas import Series, DataFrame, date_range, DatetimeIndex, Timestamp from pandas import compat from pandas.compat import range, long, lrange, lmap, u -from pandas.core.common import notnull, isnull +from pandas.core.common import notnull, isnull, array_equivalent import pandas.core.common as com import pandas.util.testing as tm import pandas.core.config as cf @@ -167,6 +166,19 @@ def test_downcast_conv(): result = com._possibly_downcast_to_dtype(arr,'infer') tm.assert_almost_equal(result, expected) + +def test_array_equivalent(): + assert array_equivalent(np.array([np.nan, np.nan]), np.array([np.nan, np.nan])) + assert array_equivalent(np.array([np.nan, 1, np.nan]), np.array([np.nan, 1, np.nan])) + assert array_equivalent(np.array([np.nan, None], dtype='object'), + np.array([np.nan, None], dtype='object')) + assert array_equivalent(np.array([np.nan, 1+1j], dtype='complex'), + np.array([np.nan, 1+1j], dtype='complex')) + assert not array_equivalent(np.array([np.nan, 1+1j], dtype='complex'), + np.array([np.nan, 1+2j], dtype='complex')) + assert not array_equivalent(np.array([np.nan, 1, np.nan]), 
np.array([np.nan, 2, np.nan])) + assert not array_equivalent(np.array(['a', 'b', 'c', 'd']), np.array(['e', 'e'])) + def test_datetimeindex_from_empty_datetime64_array(): for unit in [ 'ms', 'us', 'ns' ]: idx = DatetimeIndex(np.array([], dtype='datetime64[%s]' % unit)) diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index 2d72a596dc769..60c51c09915f3 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -812,7 +812,8 @@ def test_metadata_propagation_indiv(self): self.check_metadata(df,result) # resample - df = DataFrame(np.random.randn(1000,2), index=date_range('20130101',periods=1000,freq='s')) + df = DataFrame(np.random.randn(1000,2), + index=date_range('20130101',periods=1000,freq='s')) result = df.resample('1T') self.check_metadata(df,result) @@ -851,6 +852,80 @@ def test_squeeze(self): p4d = tm.makePanel4D().reindex(labels=['label1'],items=['ItemA']) tm.assert_frame_equal(p4d.squeeze(),p4d.ix['label1','ItemA']) + def test_equals(self): + s1 = pd.Series([1, 2, 3], index=[0, 2, 1]) + s2 = s1.copy() + self.assert_(s1.equals(s2)) + + s1[1] = 99 + self.assert_(not s1.equals(s2)) + + # NaNs compare as equal + s1 = pd.Series([1, np.nan, 3, np.nan], index=[0, 2, 1, 3]) + s2 = s1.copy() + self.assert_(s1.equals(s2)) + + s2[0] = 9.9 + self.assert_(not s1.equals(s2)) + + idx = MultiIndex.from_tuples([(0, 'a'), (1, 'b'), (2, 'c')]) + s1 = Series([1, 2, np.nan], index=idx) + s2 = s1.copy() + self.assert_(s1.equals(s2)) + + # Add object dtype column with nans + index = np.random.random(10) + df1 = DataFrame(np.random.random(10,), index=index, columns=['floats']) + df1['text'] = 'the sky is so blue. 
we could use more chocolate.'.split() + df1['start'] = date_range('2000-1-1', periods=10, freq='T') + df1['end'] = date_range('2000-1-1', periods=10, freq='D') + df1['diff'] = df1['end'] - df1['start'] + df1['bool'] = (np.arange(10) % 3 == 0) + df1.ix[::2] = nan + df2 = df1.copy() + self.assert_(df1['text'].equals(df2['text'])) + self.assert_(df1['start'].equals(df2['start'])) + self.assert_(df1['end'].equals(df2['end'])) + self.assert_(df1['diff'].equals(df2['diff'])) + self.assert_(df1['bool'].equals(df2['bool'])) + self.assert_(df1.equals(df2)) + self.assert_(not df1.equals(object)) + + # different dtype + different = df1.copy() + different['floats'] = different['floats'].astype('float32') + self.assert_(not df1.equals(different)) + + # different index + different_index = -index + different = df2.set_index(different_index) + self.assert_(not df1.equals(different)) + + # different columns + different = df2.copy() + different.columns = df2.columns[::-1] + self.assert_(not df1.equals(different)) + + # DatetimeIndex + index = pd.date_range('2000-1-1', periods=10, freq='T') + df1 = df1.set_index(index) + df2 = df1.copy() + self.assert_(df1.equals(df2)) + + # MultiIndex + df3 = df1.set_index(['text'], append=True) + df2 = df1.set_index(['text'], append=True) + self.assert_(df3.equals(df2)) + + df2 = df1.set_index(['floats'], append=True) + self.assert_(not df3.equals(df2)) + + # NaN in index + df3 = df1.set_index(['floats'], append=True) + df2 = df1.set_index(['floats'], append=True) + self.assert_(df3.equals(df2)) + + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py index 701b240479a62..27860b738d161 100644 --- a/pandas/tests/test_internals.py +++ b/pandas/tests/test_internals.py @@ -584,6 +584,27 @@ def test_missing_unicode_key(self): except KeyError: pass # this is the expected exception + def test_equals(self): + # unique items 
+ index = Index(list('abcdef')) + block1 = make_block(np.arange(12).reshape(3,4), list('abc'), index) + block2 = make_block(np.arange(12).reshape(3,4)*10, list('def'), index) + block1.ref_items = block2.ref_items = index + bm1 = BlockManager([block1, block2], [index, np.arange(block1.shape[1])]) + bm2 = BlockManager([block2, block1], [index, np.arange(block1.shape[1])]) + self.assert_(bm1.equals(bm2)) + + # non-unique items + index = Index(list('aaabbb')) + block1 = make_block(np.arange(12).reshape(3,4), list('aaa'), index, + placement=[0,1,2]) + block2 = make_block(np.arange(12).reshape(3,4)*10, list('bbb'), index, + placement=[3,4,5]) + block1.ref_items = block2.ref_items = index + bm1 = BlockManager([block1, block2], [index, np.arange(block1.shape[1])]) + bm2 = BlockManager([block2, block1], [index, np.arange(block1.shape[1])]) + self.assert_(bm1.equals(bm2)) + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 7b12467575f78..80e33eb1717da 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -436,6 +436,7 @@ def isiterable(obj): def is_sorted(seq): return assert_almost_equal(seq, np.sort(np.array(seq))) +# This could be refactored to use the NDFrame.equals method def assert_series_equal(left, right, check_dtype=True, check_index_type=False, check_series_type=False, @@ -455,7 +456,7 @@ def assert_series_equal(left, right, check_dtype=True, assert_attr_equal('dtype', left.index, right.index) assert_attr_equal('inferred_type', left.index, right.index) - +# This could be refactored to use the NDFrame.equals method def assert_frame_equal(left, right, check_dtype=True, check_index_type=False, check_column_type=False, diff --git a/vb_suite/frame_methods.py b/vb_suite/frame_methods.py index c3425389684ae..e658ce75247b4 100644 --- a/vb_suite/frame_methods.py +++ b/vb_suite/frame_methods.py @@ -370,3 +370,36 @@ def f(K=100): 
frame_dtypes = Benchmark('df.dtypes', setup, start_date=datetime(2012,1,1)) +#---------------------------------------------------------------------- +# equals +setup = common_setup + """ +def make_pair(name): + df = globals()[name] + df2 = df.copy() + df2.ix[-1,-1] = np.nan + return df, df2 + +def test_equal(name): + df, df2 = pairs[name] + return df.equals(df) + +def test_unequal(name): + df, df2 = pairs[name] + return df.equals(df2) + +float_df = DataFrame(np.random.randn(1000, 1000)) +object_df = DataFrame([['foo']*1000]*1000) +nonunique_cols = object_df.copy() +nonunique_cols.columns = ['A']*len(nonunique_cols.columns) + +pairs = dict([(name,make_pair(name)) + for name in ('float_df', 'object_df', 'nonunique_cols')]) +""" +frame_float_equal = Benchmark('test_equal("float_df")', setup) +frame_object_equal = Benchmark('test_equal("object_df")', setup) +frame_nonunique_equal = Benchmark('test_equal("nonunique_cols")', setup) + +frame_float_unequal = Benchmark('test_unequal("float_df")', setup) +frame_object_unequal = Benchmark('test_unequal("object_df")', setup) +frame_nonunique_unequal = Benchmark('test_unequal("nonunique_cols")', setup) +