diff --git a/doc/source/basics.rst b/doc/source/basics.rst index 349e7e25fdafb..648013b393e1e 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -240,14 +240,14 @@ way to summarize a boolean result. .. ipython:: python - (df>0).all() - (df>0).any() + (df > 0).all() + (df > 0).any() You can reduce to a final boolean value. .. ipython:: python - (df>0).any().any() + (df > 0).any().any() You can test if a pandas object is empty, via the :attr:`~DataFrame.empty` property. @@ -330,6 +330,48 @@ equality to be True: df1.equals(df2) df1.equals(df2.sort()) +Comparing array-like objects +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can conveniently do element-wise comparisons when comparing a pandas +data structure with a scalar value: + +.. ipython:: python + + pd.Series(['foo', 'bar', 'baz']) == 'foo' + pd.Index(['foo', 'bar', 'baz']) == 'foo' + +Pandas also handles element-wise comparisons between different array-like +objects of the same length: + +.. ipython:: python + + pd.Series(['foo', 'bar', 'baz']) == pd.Index(['foo', 'bar', 'qux']) + pd.Series(['foo', 'bar', 'baz']) == np.array(['foo', 'bar', 'qux']) + +Trying to compare ``Index`` or ``Series`` objects of different lengths will +raise a ``ValueError``: + +.. code-block:: python + + In [55]: pd.Series(['foo', 'bar', 'baz']) == pd.Series(['foo', 'bar']) + ValueError: Series lengths must match to compare + + In [56]: pd.Series(['foo', 'bar', 'baz']) == pd.Series(['foo']) + ValueError: Series lengths must match to compare + +Note that this is different from the numpy behavior where a comparison can +be broadcast: + +.. ipython:: python + + np.array([1, 2, 3]) == np.array([2]) + +or it can return False if broadcasting cannot be done: + +.. 
ipython:: python + + np.array([1, 2, 3]) == np.array([1, 2]) Combining overlapping data sets ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index a3ec13439fe76..eecc9cdbcc131 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -46,6 +46,76 @@ Backwards incompatible API changes .. _whatsnew_0170.api_breaking: +- Operator equal on Index should behave similarly to Series (:issue:`9947`) + +Starting in v0.17.0, comparing ``Index`` objects of different lengths will raise +a ``ValueError``. This is to be consistent with the behavior of ``Series``. + +Previous behavior: + +.. code-block:: python + + In [2]: pd.Index([1, 2, 3]) == pd.Index([1, 4, 5]) + Out[2]: array([ True, False, False], dtype=bool) + + In [3]: pd.Index([1, 2, 3]) == pd.Index([2]) + Out[3]: array([False, True, False], dtype=bool) + + In [4]: pd.Index([1, 2, 3]) == pd.Index([1, 2]) + Out[4]: False + + In [5]: pd.Series([1, 2, 3]) == pd.Series([1, 4, 5]) + Out[5]: + 0 True + 1 False + 2 False + dtype: bool + + In [6]: pd.Series([1, 2, 3]) == pd.Series([2]) + ValueError: Series lengths must match to compare + + In [7]: pd.Series([1, 2, 3]) == pd.Series([1, 2]) + ValueError: Series lengths must match to compare + +New behavior: + +.. 
code-block:: python + + In [8]: pd.Index([1, 2, 3]) == pd.Index([1, 4, 5]) + Out[8]: array([ True, False, False], dtype=bool) + + In [9]: pd.Index([1, 2, 3]) == pd.Index([2]) + ValueError: Lengths must match to compare + + In [10]: pd.Index([1, 2, 3]) == pd.Index([1, 2]) + ValueError: Lengths must match to compare + + In [11]: pd.Series([1, 2, 3]) == pd.Series([1, 4, 5]) + Out[11]: + 0 True + 1 False + 2 False + dtype: bool + + In [12]: pd.Series([1, 2, 3]) == pd.Series([2]) + ValueError: Series lengths must match to compare + + In [13]: pd.Series([1, 2, 3]) == pd.Series([1, 2]) + ValueError: Series lengths must match to compare + +Note that this is different from the ``numpy`` behavior where a comparison can +be broadcast: + +.. ipython:: python + + np.array([1, 2, 3]) == np.array([1]) + +or it can return False if broadcasting can not be done: + +.. ipython:: python + + np.array([1, 2, 3]) == np.array([1, 2]) + .. _whatsnew_0170.api_breaking.other: Other API Changes @@ -149,3 +219,4 @@ Bug Fixes - Bug in ``Series.plot(kind='hist')`` Y Label not informative (:issue:`10485`) +- Bug in operator equal on Index not being consistent with Series (:issue:`9947`) diff --git a/pandas/core/index.py b/pandas/core/index.py index 4e69eeb600ecb..7047f07280012 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -2593,6 +2593,9 @@ def _add_comparison_methods(cls): def _make_compare(op): def _evaluate_compare(self, other): + if isinstance(other, (np.ndarray, Index, ABCSeries)): + if other.ndim > 0 and len(self) != len(other): + raise ValueError('Lengths must match to compare') func = getattr(self.values, op) result = func(np.asarray(other)) diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 32f20337c109a..3d901837f5123 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -1550,22 +1550,70 @@ def test_groupby(self): tm.assert_dict_equal(groups, exp) def test_equals_op(self): - # For issue #9785 + # GH9947 index_a = 
Index(['foo', 'bar', 'baz']) index_b = Index(['foo', 'bar', 'baz', 'qux']) - # Testing Numpy Results Equivelent - assert_array_equal( - index_a.equals(index_a), - index_a == index_a - ) - assert_array_equal( - index_a.equals(index_b), - index_a == index_b, - ) - assert_array_equal( - index_b.equals(index_a), - index_b == index_a, - ) + index_c = Index(['foo', 'bar', 'qux']) + index_d = Index(['foo']) + with tm.assertRaisesRegexp(ValueError, "Lengths must match"): + index_a == index_b + assert_array_equal(index_a == index_a, np.array([True, True, True])) + assert_array_equal(index_a == index_c, np.array([True, True, False])) + + # test comparisons with numpy arrays + array_a = np.array(['foo', 'bar', 'baz']) + array_b = np.array(['foo', 'bar', 'baz', 'qux']) + array_c = np.array(['foo', 'bar', 'qux']) + array_d = np.array(['foo']) + with tm.assertRaisesRegexp(ValueError, "Lengths must match"): + index_a == array_b + assert_array_equal(index_a == array_a, np.array([True, True, True])) + assert_array_equal(index_a == array_c, np.array([True, True, False])) + + # test comparisons with Series + series_a = Series(['foo', 'bar', 'baz']) + series_b = Series(['foo', 'bar', 'baz', 'qux']) + series_c = Series(['foo', 'bar', 'qux']) + series_d = Series(['foo']) + with tm.assertRaisesRegexp(ValueError, "Lengths must match"): + index_a == series_b + assert_array_equal(index_a == series_a, np.array([True, True, True])) + assert_array_equal(index_a == series_c, np.array([True, True, False])) + + # cases where length is 1 for one of them + with tm.assertRaisesRegexp(ValueError, "Lengths must match"): + index_a == index_d + with tm.assertRaisesRegexp(ValueError, "Lengths must match"): + index_a == series_d + with tm.assertRaisesRegexp(ValueError, "Lengths must match"): + index_a == array_d + with tm.assertRaisesRegexp(ValueError, "Series lengths must match"): + series_a == series_d + with tm.assertRaisesRegexp(ValueError, "Lengths must match"): + series_a == array_d + + # comparing 
with scalar should broadcast + assert_array_equal(index_a == 'foo', np.array([True, False, False])) + assert_array_equal(series_a == 'foo', np.array([True, False, False])) + assert_array_equal(array_a == 'foo', np.array([True, False, False])) + + # GH9785 + # test comparisons of multiindex + from pandas.compat import StringIO + df = pd.read_csv(StringIO('a,b,c\n1,2,3\n4,5,6'), index_col=[0, 1]) + assert_array_equal(df.index == df.index, np.array([True, True])) + + mi1 = MultiIndex.from_tuples([(1, 2), (4, 5)]) + assert_array_equal(df.index == mi1, np.array([True, True])) + mi2 = MultiIndex.from_tuples([(1, 2), (4, 6)]) + assert_array_equal(df.index == mi2, np.array([True, False])) + mi3 = MultiIndex.from_tuples([(1, 2), (4, 5), (8, 9)]) + with tm.assertRaisesRegexp(ValueError, "Lengths must match"): + df.index == mi3 + with tm.assertRaisesRegexp(ValueError, "Lengths must match"): + df.index == index_a + assert_array_equal(index_a == mi3, np.array([False, False, False])) + class TestCategoricalIndex(Base, tm.TestCase): _holder = CategoricalIndex @@ -4815,47 +4863,9 @@ def test_index_name_retained(self): tm.assert_frame_equal(result, df_expected) def test_equals_operator(self): - # For issue #9785 + # GH9785 self.assertTrue((self.index == self.index).all()) - def test_index_compare(self): - # For issue #9785 - index_unequal = Index(['foo', 'bar', 'baz']) - index_equal = Index([ - ('foo', 'one'), ('foo', 'two'), ('bar', 'one'), - ('baz', 'two'), ('qux', 'one'), ('qux', 'two') - ], tupleize_cols=False) - # Testing Numpy Results Equivelent - assert_array_equal( - index_unequal.equals(self.index), - index_unequal == self.index, - err_msg = 'Index compared with MultiIndex failed', - ) - assert_array_equal( - self.index.equals(index_unequal), - self.index == index_unequal, - err_msg = 'MultiIndex compared with Index failed', - ) - assert_array_equal( - self.index.equals(index_equal), - self.index == index_equal, - err_msg = 'MultiIndex compared with Similar Index failed', 
- ) - assert_array_equal( - index_equal.equals(self.index), - index_equal == self.index, - err_msg = 'Index compared with Similar MultiIndex failed', - ) - # Testing that the result is true for the index_equal case - self.assertTrue( - (self.index == index_equal).all(), - msg='Assert Index compared with Similar MultiIndex match' - ) - self.assertTrue( - (index_equal == self.index).all(), - msg='Assert MultiIndex compared with Similar Index match' - ) - def test_get_combined_index(): from pandas.core.index import _get_combined_index