Skip to content

Commit 9f26fbc

Browse files
committed
API: Add equals method to NDFrames. (Implemented with array_equivalent, which is similar to np.array_equal except that it handles object arrays and treats NaNs in corresponding locations as equal.)
TST: Add tests for NDFrame.equals and BlockManager.equals. DOC: Mention the equals method in basics, release and v0.13.1
1 parent c9013b8 commit 9f26fbc

File tree

11 files changed

+242
-8
lines changed

11 files changed

+242
-8
lines changed

doc/source/basics.rst

+8
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,14 @@ These operations produce a pandas object the same type as the left-hand-side inp
215215
that if of dtype ``bool``. These ``boolean`` objects can be used in indexing operations,
216216
see :ref:`here<indexing.boolean>`
217217

218+
As of v0.13.1, Series, DataFrames and Panels have an ``equals`` method to test whether
219+
two such objects are equal.
220+
221+
.. ipython:: python
222+
223+
df.equals(df)
224+
df.equals(df2)
225+
218226
.. _basics.reductions:
219227

220228
Boolean Reductions

doc/source/release.rst

+1
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ API Changes
6666
timedeltas (:issue:`5458`,:issue:`5689`)
6767
- Add ``-NaN`` and ``-nan`` to the default set of NA values
6868
(:issue:`5952`). See :ref:`NA Values <io.na_values>`.
69+
- ``NDFrame`` now has an ``equals`` method. (:issue:`5283`)
6970

7071
Experimental Features
7172
~~~~~~~~~~~~~~~~~~~~~

doc/source/v0.13.1.txt

+15
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,21 @@ API changes
4242

4343
- Add ``-NaN`` and ``-nan`` to the default set of NA values (:issue:`5952`).
4444
See :ref:`NA Values <io.na_values>`.
45+
- Added the ``NDFrame.equals()`` method to compare if two NDFrames are
46+
equal, i.e. have equal axes, dtypes, and values. Added the
47+
``array_equivalent`` function to compare if two ndarrays are
48+
equal. NaNs in identical locations are treated as
49+
equal. (:issue:`5283`)
50+
51+
.. ipython:: python
52+
53+
df = DataFrame({'col':['foo', 0, np.nan]}).sort()
54+
df2 = DataFrame({'col':[np.nan, 0, 'foo']}, index=[2,1,0])
55+
df.equals(df)
56+
57+
import pandas.core.common as com
58+
com.array_equivalent(np.array([0, np.nan]), np.array([0, np.nan]))
59+
np.array_equal(np.array([0, np.nan]), np.array([0, np.nan]))
4560

4661
Prior Version Deprecations/Changes
4762
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

pandas/core/common.py

+36
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,42 @@ def notnull(obj):
277277
return -res
278278

279279

280+
def array_equivalent(left, right):
    """
    True if two arrays, left and right, have equal non-NaN elements, and NaNs
    in corresponding locations. False otherwise. It is assumed that left and
    right have the same dtype once converted to NumPy arrays. The behavior of
    this function (particularly with respect to NaNs) is not defined if the
    dtypes are different; only the dtype of ``left`` is inspected.

    Parameters
    ----------
    left, right : array_like
        Input arrays. Each is passed through ``np.asarray``, so lists and
        tuples are accepted as well as ndarrays.

    Returns
    -------
    b : bool
        Returns True if the arrays are equivalent.

    Examples
    --------
    >>> array_equivalent([1, 2, np.nan], np.array([1, 2, np.nan]))
    True
    >>> array_equivalent([1, np.nan, 2], [1, 2, np.nan])
    False
    """
    # Coerce first: the checks below need ``.shape`` and ``.dtype``, which
    # plain sequences (as used in the examples above) do not provide.
    left, right = np.asarray(left), np.asarray(right)
    if left.shape != right.shape:
        return False
    # NaNs occur only in object arrays, float or complex arrays.
    if left.dtype == np.object_:
        # If object array, we need to use pd.isnull
        return ((left == right) | pd.isnull(left) & pd.isnull(right)).all()
    elif not issubclass(left.dtype.type, (np.floating, np.complexfloating)):
        # if not a float or complex array, then there are no NaNs
        return np.array_equal(left, right)
    # For float or complex arrays, using np.isnan is faster than pd.isnull
    return ((left == right) | (np.isnan(left) & np.isnan(right))).all()
315+
280316
def _iterable_not_string(x):
281317
return (isinstance(x, collections.Iterable) and
282318
not isinstance(x, compat.string_types))

pandas/core/generic.py

+9
Original file line numberDiff line numberDiff line change
@@ -608,6 +608,15 @@ def __invert__(self):
608608
arr = operator.inv(_values_from_object(self))
609609
return self._wrap_array(arr, self.axes, copy=False)
610610

611+
def equals(self, other):
    """
    Determines if two NDFrame objects contain the same elements. NaNs in the
    same location are considered equal.

    Returns False when ``other`` is not an instance of ``self._constructor``;
    otherwise the comparison is delegated to ``self._data.equals``.
    """
    if isinstance(other, self._constructor):
        return self._data.equals(other._data)
    return False
619+
611620
#----------------------------------------------------------------------
612621
# Iteration
613622

pandas/core/internals.py

+27-4
Original file line numberDiff line numberDiff line change
@@ -1121,13 +1121,24 @@ def func(c, v, o):
11211121

11221122
return result_blocks
11231123

1124+
def equals(self, other):
    """
    True if ``other`` has the same dtype and shape as this block and
    ``np.array_equal`` reports their values as equal.
    """
    if self.dtype != other.dtype:
        return False
    if self.shape != other.shape:
        return False
    return np.array_equal(self.values, other.values)
1127+
11241128

11251129
class NumericBlock(Block):
11261130
is_numeric = True
11271131
_can_hold_na = True
11281132

11291133

1130-
class FloatBlock(NumericBlock):
1134+
class FloatOrComplexBlock(NumericBlock):
    """Common base for the float and complex blocks; supplies a NaN-aware
    ``equals``."""

    def equals(self, other):
        """
        True if ``other`` has the same dtype and shape and every position
        either compares equal or holds NaN in both blocks.
        """
        if self.dtype != other.dtype:
            return False
        if self.shape != other.shape:
            return False
        mine, theirs = self.values, other.values
        matching = mine == theirs
        # NaN != NaN elementwise, so positions where both are NaN are
        # accepted explicitly.
        both_nan = np.isnan(mine) & np.isnan(theirs)
        return (matching | both_nan).all()
1139+
1140+
1141+
class FloatBlock(FloatOrComplexBlock):
11311142
is_float = True
11321143
_downcast_dtype = 'int64'
11331144

@@ -1166,8 +1177,7 @@ def should_store(self, value):
11661177
return (issubclass(value.dtype.type, np.floating) and
11671178
value.dtype == self.dtype)
11681179

1169-
1170-
class ComplexBlock(NumericBlock):
1180+
class ComplexBlock(FloatOrComplexBlock):
11711181
is_complex = True
11721182

11731183
def _can_hold_element(self, element):
@@ -2563,7 +2573,7 @@ def get_data(self, copy=False, columns=None, **kwargs):
25632573
return self.combine(blocks)
25642574

25652575
def combine(self, blocks):
2566-
""" reutrn a new manager with the blocks """
2576+
""" return a new manager with the blocks """
25672577
indexer = np.sort(np.concatenate([b.ref_locs for b in blocks]))
25682578
new_items = self.items.take(indexer)
25692579

@@ -3491,6 +3501,16 @@ def item_dtypes(self):
34913501
raise AssertionError('Some items were not in any block')
34923502
return result
34933503

3504+
def equals(self, other):
    """
    True if ``other`` has pairwise-equal axes and pairwise-equal blocks.

    Both managers are consolidated in place (``_consolidate_inplace``)
    before their blocks are compared, so calling this may mutate ``self``
    and ``other``. Blocks are compared positionally via each block's own
    ``equals`` method.
    """
    self_axes, other_axes = self.axes, other.axes
    if len(self_axes) != len(other_axes):
        return False
    if not all(ax1.equals(ax2) for ax1, ax2 in zip(self_axes, other_axes)):
        return False
    self._consolidate_inplace()
    other._consolidate_inplace()
    # zip() stops at the shorter sequence; without this check a manager
    # with extra blocks would be compared on the common prefix only.
    if len(self.blocks) != len(other.blocks):
        return False
    return all(block.equals(oblock) for block, oblock in
               zip(self.blocks, other.blocks))
34943514

34953515
class SingleBlockManager(BlockManager):
34963516

@@ -4004,6 +4024,9 @@ def _merge_blocks(blocks, items, dtype=None, _can_consolidate=True):
40044024
raise AssertionError("_merge_blocks are invalid!")
40054025
dtype = blocks[0].dtype
40064026

4027+
if not items.is_unique:
4028+
blocks = sorted(blocks, key=lambda b: b.ref_locs.tolist())
4029+
40074030
new_values = _vstack([b.values for b in blocks], dtype)
40084031
new_items = blocks[0].items.append([b.items for b in blocks[1:]])
40094032
new_block = make_block(new_values, new_items, items)

pandas/tests/test_common.py

+14-2
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,10 @@
55
from nose.tools import assert_equal
66
import numpy as np
77
from pandas.tslib import iNaT, NaT
8-
98
from pandas import Series, DataFrame, date_range, DatetimeIndex, Timestamp
109
from pandas import compat
1110
from pandas.compat import range, long, lrange, lmap, u
12-
from pandas.core.common import notnull, isnull
11+
from pandas.core.common import notnull, isnull, array_equivalent
1312
import pandas.core.common as com
1413
import pandas.util.testing as tm
1514
import pandas.core.config as cf
@@ -167,6 +166,19 @@ def test_downcast_conv():
167166
result = com._possibly_downcast_to_dtype(arr,'infer')
168167
tm.assert_almost_equal(result, expected)
169168

169+
170+
def test_array_equivalent():
    """Check array_equivalent on float, object, complex and string arrays,
    including NaNs in matching positions and mismatched shapes."""
    equivalent_pairs = [
        (np.array([np.nan, np.nan]), np.array([np.nan, np.nan])),
        (np.array([np.nan, 1, np.nan]), np.array([np.nan, 1, np.nan])),
        (np.array([np.nan, None], dtype='object'),
         np.array([np.nan, None], dtype='object')),
        (np.array([np.nan, 1+1j], dtype='complex'),
         np.array([np.nan, 1+1j], dtype='complex')),
    ]
    for left, right in equivalent_pairs:
        assert array_equivalent(left, right)

    different_pairs = [
        (np.array([np.nan, 1+1j], dtype='complex'),
         np.array([np.nan, 1+2j], dtype='complex')),
        (np.array([np.nan, 1, np.nan]), np.array([np.nan, 2, np.nan])),
        (np.array(['a', 'b', 'c', 'd']), np.array(['e', 'e'])),
    ]
    for left, right in different_pairs:
        assert not array_equivalent(left, right)
181+
170182
def test_datetimeindex_from_empty_datetime64_array():
171183
for unit in [ 'ms', 'us', 'ns' ]:
172184
idx = DatetimeIndex(np.array([], dtype='datetime64[%s]' % unit))

pandas/tests/test_generic.py

+76-1
Original file line numberDiff line numberDiff line change
@@ -812,7 +812,8 @@ def test_metadata_propagation_indiv(self):
812812
self.check_metadata(df,result)
813813

814814
# resample
815-
df = DataFrame(np.random.randn(1000,2), index=date_range('20130101',periods=1000,freq='s'))
815+
df = DataFrame(np.random.randn(1000,2),
816+
index=date_range('20130101',periods=1000,freq='s'))
816817
result = df.resample('1T')
817818
self.check_metadata(df,result)
818819

@@ -851,6 +852,80 @@ def test_squeeze(self):
851852
p4d = tm.makePanel4D().reindex(labels=['label1'],items=['ItemA'])
852853
tm.assert_frame_equal(p4d.squeeze(),p4d.ix['label1','ItemA'])
853854

855+
def test_equals(self):
    """Exercise NDFrame.equals on Series and DataFrames: copies, NaNs,
    mixed dtypes, and differing dtype / index / columns."""
    left = pd.Series([1, 2, 3], index=[0, 2, 1])
    right = left.copy()
    self.assert_(left.equals(right))

    left[1] = 99
    self.assert_(not left.equals(right))

    # NaNs compare as equal
    left = pd.Series([1, np.nan, 3, np.nan], index=[0, 2, 1, 3])
    right = left.copy()
    self.assert_(left.equals(right))

    right[0] = 9.9
    self.assert_(not left.equals(right))

    multi = MultiIndex.from_tuples([(0, 'a'), (1, 'b'), (2, 'c')])
    left = Series([1, 2, np.nan], index=multi)
    right = left.copy()
    self.assert_(left.equals(right))

    # Add object dtype column with nans
    float_index = np.random.random(10)
    frame = DataFrame(np.random.random(10,), index=float_index,
                      columns=['floats'])
    frame['text'] = 'the sky is so blue. we could use more chocolate.'.split()
    frame['start'] = date_range('2000-1-1', periods=10, freq='T')
    frame['end'] = date_range('2000-1-1', periods=10, freq='D')
    frame['diff'] = frame['end'] - frame['start']
    frame['bool'] = (np.arange(10) % 3 == 0)
    frame.ix[::2] = nan
    clone = frame.copy()
    for column in ('text', 'start', 'end', 'diff', 'bool'):
        self.assert_(frame[column].equals(clone[column]))
    self.assert_(frame.equals(clone))
    self.assert_(not frame.equals(object))

    # different dtype
    altered = frame.copy()
    altered['floats'] = altered['floats'].astype('float32')
    self.assert_(not frame.equals(altered))

    # different index
    negated_index = -float_index
    altered = clone.set_index(negated_index)
    self.assert_(not frame.equals(altered))

    # different columns
    altered = clone.copy()
    altered.columns = clone.columns[::-1]
    self.assert_(not frame.equals(altered))

    # DatetimeIndex
    minute_index = pd.date_range('2000-1-1', periods=10, freq='T')
    frame = frame.set_index(minute_index)
    clone = frame.copy()
    self.assert_(frame.equals(clone))

    # MultiIndex
    by_text = frame.set_index(['text'], append=True)
    clone = frame.set_index(['text'], append=True)
    self.assert_(by_text.equals(clone))

    clone = frame.set_index(['floats'], append=True)
    self.assert_(not by_text.equals(clone))

    # NaN in index
    by_floats = frame.set_index(['floats'], append=True)
    clone = frame.set_index(['floats'], append=True)
    self.assert_(by_floats.equals(clone))
927+
928+
854929
if __name__ == '__main__':
855930
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
856931
exit=False)

pandas/tests/test_internals.py

+21
Original file line numberDiff line numberDiff line change
@@ -584,6 +584,27 @@ def test_missing_unicode_key(self):
584584
except KeyError:
585585
pass # this is the expected exception
586586

587+
def test_equals(self):
    """BlockManager.equals should not depend on the order in which the
    blocks were passed in, for unique and non-unique items alike."""
    # unique items
    items = Index(list('abcdef'))
    first = make_block(np.arange(12).reshape(3,4), list('abc'), items)
    second = make_block(np.arange(12).reshape(3,4)*10, list('def'), items)
    first.ref_items = second.ref_items = items
    forward = BlockManager([first, second],
                           [items, np.arange(first.shape[1])])
    backward = BlockManager([second, first],
                            [items, np.arange(first.shape[1])])
    self.assert_(forward.equals(backward))

    # non-unique items
    items = Index(list('aaabbb'))
    first = make_block(np.arange(12).reshape(3,4), list('aaa'), items,
                       placement=[0,1,2])
    second = make_block(np.arange(12).reshape(3,4)*10, list('bbb'), items,
                        placement=[3,4,5])
    first.ref_items = second.ref_items = items
    forward = BlockManager([first, second],
                           [items, np.arange(first.shape[1])])
    backward = BlockManager([second, first],
                            [items, np.arange(first.shape[1])])
    self.assert_(forward.equals(backward))
607+
587608
if __name__ == '__main__':
588609
import nose
589610
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],

pandas/util/testing.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -436,6 +436,7 @@ def isiterable(obj):
436436
def is_sorted(seq):
437437
return assert_almost_equal(seq, np.sort(np.array(seq)))
438438

439+
# This could be refactored to use the NDFrame.equals method
439440
def assert_series_equal(left, right, check_dtype=True,
440441
check_index_type=False,
441442
check_series_type=False,
@@ -455,7 +456,7 @@ def assert_series_equal(left, right, check_dtype=True,
455456
assert_attr_equal('dtype', left.index, right.index)
456457
assert_attr_equal('inferred_type', left.index, right.index)
457458

458-
459+
# This could be refactored to use the NDFrame.equals method
459460
def assert_frame_equal(left, right, check_dtype=True,
460461
check_index_type=False,
461462
check_column_type=False,

vb_suite/frame_methods.py

+33
Original file line numberDiff line numberDiff line change
@@ -370,3 +370,36 @@ def f(K=100):
370370
frame_dtypes = Benchmark('df.dtypes', setup,
371371
start_date=datetime(2012,1,1))
372372

373+
#----------------------------------------------------------------------
374+
# equals
375+
setup = common_setup + """
376+
def make_pair(name):
377+
df = globals()[name]
378+
df2 = df.copy()
379+
df2.ix[-1,-1] = np.nan
380+
return df, df2
381+
382+
def test_equal(name):
383+
df, df2 = pairs[name]
384+
return df.equals(df)
385+
386+
def test_unequal(name):
387+
df, df2 = pairs[name]
388+
return df.equals(df2)
389+
390+
float_df = DataFrame(np.random.randn(1000, 1000))
391+
object_df = DataFrame([['foo']*1000]*1000)
392+
nonunique_cols = object_df.copy()
393+
nonunique_cols.columns = ['A']*len(nonunique_cols.columns)
394+
395+
pairs = dict([(name,make_pair(name))
396+
for name in ('float_df', 'object_df', 'nonunique_cols')])
397+
"""
398+
frame_float_equal = Benchmark('test_equal("float_df")', setup)
399+
frame_object_equal = Benchmark('test_equal("object_df")', setup)
400+
frame_nonunique_equal = Benchmark('test_equal("nonunique_cols")', setup)
401+
402+
frame_float_unequal = Benchmark('test_unequal("float_df")', setup)
403+
frame_object_unequal = Benchmark('test_unequal("object_df")', setup)
404+
frame_nonunique_unequal = Benchmark('test_unequal("nonunique_cols")', setup)
405+

0 commit comments

Comments
 (0)