Merge pull request #4657 from jreback/bool

jreback · jreback · commit 5148e90cad68 · 2013-08-31T10:51:58.000-07:00
API: GH4633, bool(obj) behavior, raise on __nonzero__ always
diff --git a/doc/source/10min.rst b/doc/source/10min.rst
@@ -269,7 +269,6 @@ A ``where`` operation for getting.
 
    df[df > 0]
 
-
 Setting
 ~~~~~~~
 
@@ -708,3 +707,20 @@ Reading from an excel file
    :suppress:
 
    os.remove('foo.xlsx')
+
+Gotchas
+-------
+
+If you are trying an operation and you see an exception like:
+
+.. code-block:: python
+
+    >>> if pd.Series([False, True, False]):
+        print("I was true")
+    Traceback
+        ...
+    ValueError: The truth value of an array is ambiguous. Use a.empty, a.any() or a.all().
+
+See :ref:`Comparisons<basics.compare>` for an explanation and what to do.
+
+See :ref:`Gotchas<gotchas>` as well.
diff --git a/doc/source/basics.rst b/doc/source/basics.rst
@@ -8,7 +8,7 @@
    from pandas import *
    randn = np.random.randn
    np.set_printoptions(precision=4, suppress=True)
-   from pandas.compat import lrange 
+   from pandas.compat import lrange
 
 ==============================
  Essential Basic Functionality
@@ -198,16 +198,62 @@ replace NaN with some other value using ``fillna`` if you wish).
 
 Flexible Comparisons
 ~~~~~~~~~~~~~~~~~~~~
+
+.. _basics.compare:
+
 Starting in v0.8, pandas introduced binary comparison methods eq, ne, lt, gt,
 le, and ge to Series and DataFrame whose behavior is analogous to the binary
 arithmetic operations described above:
 
 .. ipython:: python
 
    df.gt(df2)
-
    df2.ne(df)
 
+These operations produce a pandas object of the same type as the left-hand-side input
+that is of dtype ``bool``. These ``boolean`` objects can be used in indexing operations,
+see :ref:`here<indexing.boolean>`.
+
+Furthermore, you can apply the reduction functions: ``any()`` and ``all()`` to provide a
+way to summarize these results.
+
+.. ipython:: python
+
+   (df>0).all()
+   (df>0).any()
+
+Finally you can test if a pandas object is empty, via the ``empty`` property.
+
+.. ipython:: python
+
+   df.empty
+   DataFrame(columns=list('ABC')).empty
+
+.. warning::
+
+   You might be tempted to do the following:
+
+   .. code-block:: python
+
+       >>> if df:
+            ...
+
+   Or
+
+   .. code-block:: python
+
+       >>> df and df2
+
+   Both of these will raise an error, because the truth value of an object holding multiple values is ambiguous.
+
+   .. code-block:: python
+
+       ValueError: The truth value of an array is ambiguous. Use a.empty, a.any() or a.all().
+
+
+See :ref:`gotchas<gotchas.truth>` for a more detailed discussion.
+
+
 Combining overlapping data sets
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/doc/source/gotchas.rst b/doc/source/gotchas.rst
@@ -15,6 +15,58 @@
 Caveats and Gotchas
 *******************
 
+Using If/Truth Statements with Pandas
+-------------------------------------
+
+.. _gotchas.truth:
+
+Pandas follows the numpy convention of raising an error when you try to convert something to a ``bool``.
+This happens in an ``if`` statement or when using the boolean operations ``and``, ``or``, or ``not``.  It is not clear
+what the result of
+
+.. code-block:: python
+
+    >>> if Series([False, True, False]):
+         ...
+
+should be. Should it be ``True`` because it's not zero-length? ``False`` because there are ``False`` values?
+It is unclear, so instead, pandas raises a ``ValueError``:
+
+.. code-block:: python
+
+    >>> if pd.Series([False, True, False]):
+        print("I was true")
+    Traceback
+        ...
+    ValueError: The truth value of an array is ambiguous. Use a.empty, a.any() or a.all().
+
+
+If you see that, you need to explicitly choose what you want to do with it (e.g., use ``any()``, ``all()`` or ``empty``).
+Alternatively, you might want to check whether the pandas object is ``None``:
+
+.. code-block:: python
+
+    >>> if pd.Series([False, True, False]) is not None:
+           print("I was not None")
+    I was not None
+
+Bitwise boolean
+~~~~~~~~~~~~~~~
+
+Comparison operators like ``==`` and ``!=`` operate element-wise and return a boolean ``Series``,
+which is almost always what you want anyway.
+
+.. code-block:: python
+
+   >>> s = pd.Series(range(5))
+   >>> s == 4
+   0    False
+   1    False
+   2    False
+   3    False
+   4     True
+   dtype: bool
+
 ``NaN``, Integer ``NA`` values and ``NA`` type promotions
 ---------------------------------------------------------
 
@@ -428,7 +480,7 @@ parse HTML tables in the top-level pandas io function ``read_html``.
       lxml will work correctly:
 
       .. code-block:: sh
-         
+
          # remove the included version
          conda remove lxml
 
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -134,6 +134,9 @@ pandas 0.13
       now returns a ``MultiIndex`` rather than an ``Index``. (:issue:`4039`)
 
   - Infer and downcast dtype if ``downcast='infer'`` is passed to ``fillna/ffill/bfill`` (:issue:`4604`)
+  - Factored out excel_value_to_python_value from ExcelFile::_parse_excel (:issue:`4589`)
+  - ``__nonzero__`` for all NDFrame objects will now raise a ``ValueError``; this reverts back to the
+    (:issue:`1073`, :issue:`4633`) behavior.
 
 **Internal Refactoring**
 
diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt
@@ -121,6 +121,18 @@ API changes
         index.set_names(["bob", "cranberry"], inplace=True)
 
   - Infer and downcast dtype if ``downcast='infer'`` is passed to ``fillna/ffill/bfill`` (:issue:`4604`)
+  - ``__nonzero__`` for all NDFrame objects will now raise a ``ValueError``; this reverts back to the
+    (:issue:`1073`, :issue:`4633`) behavior.
+
+    This prevents behaviors like the following (all of which will now raise ``ValueError``):
+
+    .. code-block:: python
+
+        if df:
+           ....
+
+        df1 and df2
+        s1 and s2
 
 Enhancements
 ~~~~~~~~~~~~
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -531,7 +531,8 @@ def empty(self):
         return not all(len(self._get_axis(a)) > 0 for a in self._AXIS_ORDERS)
 
     def __nonzero__(self):
-        return not self.empty
+        raise ValueError("The truth value of an array is ambiguous. Use a.empty, a.any() or a.all().")
+
     __bool__ = __nonzero__
 
     #----------------------------------------------------------------------
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -2101,9 +2101,22 @@ def filter(self, func, dropna=True, *args, **kwargs):
             else:
                 res = path(group)
 
-            if res:
+            def add_indexer():
                 indexers.append(self.obj.index.get_indexer(group.index))
 
+            # interpret the result of the filter
+            if isinstance(res,(bool,np.bool_)):
+                if res:
+                    add_indexer()
+            else:
+                if getattr(res,'ndim',None) == 1:
+                    if res.ravel()[0]:
+                        add_indexer()
+                else:
+
+                    # in theory you could do .all() on the boolean result ?
+                    raise TypeError("the filter must return a boolean result")
+
         if len(indexers) == 0:
             filtered = self.obj.take([]) # because np.concatenate would fail
         else:
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -798,13 +798,6 @@ def __contains__(self, key):
     __long__ = _coerce_method(int)
     __int__ = _coerce_method(int)
 
-    def __nonzero__(self):
-        # special case of a single element bool series degenerating to a scalar
-        if self.dtype == np.bool_ and len(self) == 1:
-            return bool(self.iloc[0])
-        return not self.empty
-    __bool__ = __nonzero__
-
     # we are preserving name here
     def __getstate__(self):
         return dict(_data=self._data, name=self.name)
diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py
@@ -1593,19 +1593,19 @@ def test_table_values_dtypes_roundtrip(self):
         with ensure_clean(self.path) as store:
             df1 = DataFrame({'a': [1, 2, 3]}, dtype='f8')
             store.append('df_f8', df1)
-            assert df1.dtypes == store['df_f8'].dtypes
+            assert_series_equal(df1.dtypes,store['df_f8'].dtypes)
 
             df2 = DataFrame({'a': [1, 2, 3]}, dtype='i8')
             store.append('df_i8', df2)
-            assert df2.dtypes == store['df_i8'].dtypes
+            assert_series_equal(df2.dtypes,store['df_i8'].dtypes)
 
             # incompatible dtype
             self.assertRaises(ValueError, store.append, 'df_i8', df1)
 
             # check creation/storage/retrieval of float32 (a bit hacky to actually create them thought)
             df1 = DataFrame(np.array([[1],[2],[3]],dtype='f4'),columns = ['A'])
             store.append('df_f4', df1)
-            assert df1.dtypes == store['df_f4'].dtypes
+            assert_series_equal(df1.dtypes,store['df_f4'].dtypes)
             assert df1.dtypes[0] == 'float32'
 
             # check with mixed dtypes
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -10607,13 +10607,10 @@ def test_index_namedtuple(self):
         df = DataFrame([(1, 2), (3, 4)], index=index, columns=["A", "B"])
         self.assertEqual(df.ix[IndexType("foo", "bar")]["A"], 1)
 
-    def test_bool_empty_nonzero(self):
+    def test_empty_nonzero(self):
         df = DataFrame([1, 2, 3])
-        self.assertTrue(bool(df))
         self.assertFalse(df.empty)
         df = DataFrame(index=['a', 'b'], columns=['c', 'd']).dropna()
-        self.assertFalse(bool(df))
-        self.assertFalse(bool(df.T))
         self.assertTrue(df.empty)
         self.assertTrue(df.T.empty)
 
diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py
@@ -73,7 +73,6 @@ def _construct(self, shape, value=None, **kwargs):
             arr = np.random.randn(*shape)
         return self._typ(arr,**kwargs)
 
-
     def _compare(self, result, expected):
         self._comparator(result,expected)
 
@@ -82,14 +81,14 @@ def test_rename(self):
         # single axis
         for axis in self._axes():
             kwargs = { axis : list('ABCD') }
-            o = self._construct(4,**kwargs)
+            obj = self._construct(4,**kwargs)
 
             # no values passed
             #self.assertRaises(Exception, o.rename(str.lower))
 
             # rename a single axis
-            result = o.rename(**{ axis : str.lower })
-            expected = o.copy()
+            result = obj.rename(**{ axis : str.lower })
+            expected = obj.copy()
             setattr(expected,axis,list('abcd'))
             self._compare(result, expected)
 
@@ -119,6 +118,41 @@ def test_get_numeric_data(self):
         self._compare(result, o)
 
         # _get_numeric_data is includes _get_bool_data, so can't test for non-inclusion
+    def test_nonzero(self):
+
+        # GH 4633
+        # look at the boolean/nonzero behavior for objects
+        obj = self._construct(shape=4)
+        self.assertRaises(ValueError, lambda : bool(obj == 0))
+        self.assertRaises(ValueError, lambda : bool(obj == 1))
+        self.assertRaises(ValueError, lambda : bool(obj))
+
+        obj = self._construct(shape=4,value=1)
+        self.assertRaises(ValueError, lambda : bool(obj == 0))
+        self.assertRaises(ValueError, lambda : bool(obj == 1))
+        self.assertRaises(ValueError, lambda : bool(obj))
+
+        obj = self._construct(shape=4,value=np.nan)
+        self.assertRaises(ValueError, lambda : bool(obj == 0))
+        self.assertRaises(ValueError, lambda : bool(obj == 1))
+        self.assertRaises(ValueError, lambda : bool(obj))
+
+        # empty
+        obj = self._construct(shape=0)
+        self.assertRaises(ValueError, lambda : bool(obj))
+
+        # invalid behaviors
+
+        obj1 = self._construct(shape=4,value=1)
+        obj2 = self._construct(shape=4,value=1)
+
+        def f():
+            if obj1:
+                print("this works and shouldn't")
+        self.assertRaises(ValueError, f)
+        self.assertRaises(ValueError, lambda : obj1 and obj2)
+        self.assertRaises(ValueError, lambda : obj1 or obj2)
+        self.assertRaises(ValueError, lambda : not obj1)
 
 class TestSeries(unittest.TestCase, Generic):
     _typ = Series
@@ -154,6 +188,14 @@ def test_get_numeric_data_preserve_dtype(self):
         expected = Series([],dtype='M8[ns]')
         self._compare(result, expected)
 
+    def test_nonzero_single_element(self):
+
+        s = Series([True])
+        self.assertRaises(ValueError, lambda : bool(s))
+
+        s = Series([False])
+        self.assertRaises(ValueError, lambda : bool(s))
+
 class TestDataFrame(unittest.TestCase, Generic):
     _typ = DataFrame
     _comparator = lambda self, x, y: assert_frame_equal(x,y)
diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py
@@ -296,12 +296,6 @@ def test_scalar_conversion(self):
         self.assert_(int(Series([1.])) == 1)
         self.assert_(long(Series([1.])) == 1)
 
-        self.assert_(bool(Series([True])) == True)
-        self.assert_(bool(Series([False])) == False)
-
-        self.assert_(bool(Series([True,True])) == True)
-        self.assert_(bool(Series([False,True])) == True)
-
     def test_astype(self):
         s = Series(np.random.randn(5),name='foo')
 
diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py
@@ -256,7 +256,7 @@ def test_indexing(self):
         df = DataFrame(randn(5,5),columns=['open','high','low','close','volume'],index=date_range('2012-01-02 18:01:00',periods=5,tz='US/Central',freq='s'))
         expected = df.loc[[df.index[2]]]
         result = df['2012-01-02 18:01:02']
-        self.assert_(result == expected)
+        assert_frame_equal(result,expected)
 
         # this is a single date, so will raise
         self.assertRaises(KeyError, df.__getitem__, df.index[2],)