API/COMPAT: support axis=None for logical reduction (reduce over all axes) (#21486)

TomAugspurger · jorisvandenbossche · commit f7ed7f8e30e7 · 2018-06-26T09:34:15.000+02:00
* Compat with NumPy 1.15 logical func
* Accepts axis=None as reduce all dims
diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt
@@ -16,6 +16,36 @@ and bug fixes. We recommend that all users upgrade to this version.
     :local:
     :backlinks: none
 
+.. _whatsnew_0232.enhancements:
+
+Logical Reductions over Entire DataFrame
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+:meth:`DataFrame.all` and :meth:`DataFrame.any` now accept ``axis=None`` to reduce over all axes to a scalar (:issue:`19976`)
+
+.. ipython:: python
+
+   df = pd.DataFrame({"A": [1, 2], "B": [True, False]})
+   df.all(axis=None)
+
+
+This also provides compatibility with NumPy 1.15, which now dispatches to ``DataFrame.all``.
+With NumPy 1.15 and pandas 0.23.1 or earlier, :func:`numpy.all` will no longer reduce over every axis:
+
+.. code-block:: python
+
+   >>> # NumPy 1.15, pandas 0.23.1
+   >>> np.any(pd.DataFrame({"A": [False], "B": [False]}))
+   A    False
+   B    False
+   dtype: bool
+
+With pandas 0.23.2, that will correctly return False, as it did with NumPy < 1.15.
+
+.. ipython:: python
+
+   np.any(pd.DataFrame({"A": [False], "B": [False]}))
+
 
 .. _whatsnew_0232.fixed_regressions:
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -6846,13 +6846,18 @@ def _count_level(self, level, axis=0, numeric_only=False):
 
     def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,
                 filter_type=None, **kwds):
-        axis = self._get_axis_number(axis)
+        if axis is None and filter_type == 'bool':
+            labels = None
+            constructor = None
+        else:
+            # TODO: Make other agg func handle axis=None properly
+            axis = self._get_axis_number(axis)
+            labels = self._get_agg_axis(axis)
+            constructor = self._constructor
 
         def f(x):
             return op(x, axis=axis, skipna=skipna, **kwds)
 
-        labels = self._get_agg_axis(axis)
-
         # exclude timedelta/datetime unless we are uniform types
         if axis == 1 and self._is_mixed_type and self._is_datelike_mixed_type:
             numeric_only = True
@@ -6861,6 +6866,13 @@ def f(x):
             try:
                 values = self.values
                 result = f(values)
+
+                if (filter_type == 'bool' and is_object_dtype(values) and
+                        axis is None):
+                    # work around https://github.com/numpy/numpy/issues/10489
+                    # TODO: combine with hasattr(result, 'dtype') further down
+                    # hard since we don't have `values` down there.
+                    result = np.bool_(result)
             except Exception as e:
 
                 # try by-column first
@@ -6927,7 +6939,9 @@ def f(x):
                 if axis == 0:
                     result = coerce_to_dtypes(result, self.dtypes)
 
-        return Series(result, index=labels)
+        if constructor is not None:
+            result = Series(result, index=labels)
+        return result
 
     def nunique(self, axis=0, dropna=True):
         """
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -8728,6 +8728,8 @@ def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None,
         return rs
 
     def _agg_by_level(self, name, axis=0, level=0, skipna=True, **kwargs):
+        if axis is None:
+            raise ValueError("Must specify 'axis' when aggregating by level.")
         grouped = self.groupby(level=level, axis=axis, sort=False)
         if hasattr(grouped, name) and skipna:
             return getattr(grouped, name)(**kwargs)
@@ -9054,8 +9056,15 @@ def _doc_parms(cls):
 
 Parameters
 ----------
-axis : int, default 0
-    Select the axis which can be 0 for indices and 1 for columns.
+axis : {0 or 'index', 1 or 'columns', None}, default 0
+    Indicate which axis or axes should be reduced.
+
+    * 0 / 'index' : reduce the index, return a Series whose index is the
+      original column labels.
+    * 1 / 'columns' : reduce the columns, return a Series whose index is the
+      original index.
+    * None : reduce all axes, return a scalar.
+
 skipna : boolean, default True
     Exclude NA/null values. If an entire row/column is NA, the result
     will be NA.
@@ -9077,9 +9086,9 @@ def _doc_parms(cls):
 %(examples)s"""
 
 _all_doc = """\
-Return whether all elements are True over series or dataframe axis.
+Return whether all elements are True, potentially over an axis.
 
-Returns True if all elements within a series or along a dataframe
+Returns True if all elements within a series or along a Dataframe
 axis are non-zero, not-empty or not-False."""
 
 _all_examples = """\
@@ -9092,7 +9101,7 @@ def _doc_parms(cls):
 >>> pd.Series([True, False]).all()
 False
 
-Dataframes
+DataFrames
 
 Create a dataframe from a dictionary.
 
@@ -9109,12 +9118,17 @@ def _doc_parms(cls):
 col2    False
 dtype: bool
 
-Adding axis=1 argument will check if row-wise values all return True.
+Specify ``axis='columns'`` to check if row-wise values all return True.
 
->>> df.all(axis=1)
+>>> df.all(axis='columns')
 0     True
 1    False
 dtype: bool
+
+Or ``axis=None`` for whether every value is True.
+
+>>> df.all(axis=None)
+False
 """
 
 _all_see_also = """\
@@ -9484,6 +9498,11 @@ def _doc_parms(cls):
 1    False
 dtype: bool
 
+Aggregating over the entire DataFrame with ``axis=None``.
+
+>>> df.any(axis=None)
+True
+
 `any` for an empty DataFrame is an empty Series.
 
 >>> pd.DataFrame([]).any()
@@ -9654,22 +9673,17 @@ def _make_logical_function(cls, name, name1, name2, axis_descr, desc, f,
     @Substitution(outname=name, desc=desc, name1=name1, name2=name2,
                   axis_descr=axis_descr, examples=examples, see_also=see_also)
     @Appender(_bool_doc)
-    def logical_func(self, axis=None, bool_only=None, skipna=None, level=None,
+    def logical_func(self, axis=0, bool_only=None, skipna=True, level=None,
                      **kwargs):
         nv.validate_logical_func(tuple(), kwargs, fname=name)
-        if skipna is None:
-            skipna = True
-        if axis is None:
-            axis = self._stat_axis_number
         if level is not None:
             if bool_only is not None:
                 raise NotImplementedError("Option bool_only is not "
                                           "implemented with option level.")
             return self._agg_by_level(name, axis=axis, level=level,
                                       skipna=skipna)
-        return self._reduce(f, axis=axis, skipna=skipna,
-                            numeric_only=bool_only, filter_type='bool',
-                            name=name)
+        return self._reduce(f, name, axis=axis, skipna=skipna,
+                            numeric_only=bool_only, filter_type='bool')
 
     return set_function_name(logical_func, name, cls)
 
diff --git a/pandas/core/panel.py b/pandas/core/panel.py
@@ -1143,13 +1143,26 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,
             raise NotImplementedError('Panel.{0} does not implement '
                                       'numeric_only.'.format(name))
 
-        axis_name = self._get_axis_name(axis)
-        axis_number = self._get_axis_number(axis_name)
+        if axis is None and filter_type == 'bool':
+            # labels = None
+            # constructor = None
+            axis_number = None
+            axis_name = None
+        else:
+            # TODO: Make other agg func handle axis=None properly
+            axis = self._get_axis_number(axis)
+            # labels = self._get_agg_axis(axis)
+            # constructor = self._constructor
+            axis_name = self._get_axis_name(axis)
+            axis_number = self._get_axis_number(axis_name)
+
         f = lambda x: op(x, axis=axis_number, skipna=skipna, **kwds)
 
         with np.errstate(all='ignore'):
             result = f(self.values)
 
+        if axis is None and filter_type == 'bool':
+            return np.bool_(result)
         axes = self._get_plane_axes(axis_name)
         if result.ndim == 2 and axis_name != self._info_axis_name:
             result = result.T
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -3241,7 +3241,8 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,
         delegate = self._values
         if isinstance(delegate, np.ndarray):
             # Validate that 'axis' is consistent with Series's single axis.
-            self._get_axis_number(axis)
+            if axis is not None:
+                self._get_axis_number(axis)
             if numeric_only:
                 raise NotImplementedError('Series.{0} does not implement '
                                           'numeric_only.'.format(name))
diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
@@ -15,7 +15,7 @@
 from pandas.compat import lrange, PY35
 from pandas import (compat, isna, notna, DataFrame, Series,
                     MultiIndex, date_range, Timestamp, Categorical,
-                    _np_version_under1p12, _np_version_under1p15,
+                    _np_version_under1p12,
                     to_datetime, to_timedelta)
 import pandas as pd
 import pandas.core.nanops as nanops
@@ -1159,11 +1159,35 @@ def test_any_all(self):
         self._check_bool_op('any', np.any, has_skipna=True, has_bool_only=True)
         self._check_bool_op('all', np.all, has_skipna=True, has_bool_only=True)
 
-        df = DataFrame(randn(10, 4)) > 0
-        df.any(1)
-        df.all(1)
-        df.any(1, bool_only=True)
-        df.all(1, bool_only=True)
+    def test_any_all_extra(self):
+        df = DataFrame({
+            'A': [True, False, False],
+            'B': [True, True, False],
+            'C': [True, True, True],
+        }, index=['a', 'b', 'c'])
+        result = df[['A', 'B']].any(1)
+        expected = Series([True, True, False], index=['a', 'b', 'c'])
+        tm.assert_series_equal(result, expected)
+
+        result = df[['A', 'B']].any(1, bool_only=True)
+        tm.assert_series_equal(result, expected)
+
+        result = df.all(1)
+        expected = Series([True, False, False], index=['a', 'b', 'c'])
+        tm.assert_series_equal(result, expected)
+
+        result = df.all(1, bool_only=True)
+        tm.assert_series_equal(result, expected)
+
+        # Axis is None
+        result = df.all(axis=None).item()
+        assert result is False
+
+        result = df.any(axis=None).item()
+        assert result is True
+
+        result = df[['C']].all(axis=None).item()
+        assert result is True
 
         # skip pathological failure cases
         # class CantNonzero(object):
@@ -1185,6 +1209,86 @@ def test_any_all(self):
         # df.any(1, bool_only=True)
         # df.all(1, bool_only=True)
 
+    @pytest.mark.parametrize('func, data, expected', [
+        (np.any, {}, False),
+        (np.all, {}, True),
+        (np.any, {'A': []}, False),
+        (np.all, {'A': []}, True),
+        (np.any, {'A': [False, False]}, False),
+        (np.all, {'A': [False, False]}, False),
+        (np.any, {'A': [True, False]}, True),
+        (np.all, {'A': [True, False]}, False),
+        (np.any, {'A': [True, True]}, True),
+        (np.all, {'A': [True, True]}, True),
+
+        (np.any, {'A': [False], 'B': [False]}, False),
+        (np.all, {'A': [False], 'B': [False]}, False),
+
+        (np.any, {'A': [False, False], 'B': [False, True]}, True),
+        (np.all, {'A': [False, False], 'B': [False, True]}, False),
+
+        # other types
+        (np.all, {'A': pd.Series([0.0, 1.0], dtype='float')}, False),
+        (np.any, {'A': pd.Series([0.0, 1.0], dtype='float')}, True),
+        (np.all, {'A': pd.Series([0, 1], dtype=int)}, False),
+        (np.any, {'A': pd.Series([0, 1], dtype=int)}, True),
+        pytest.param(np.all, {'A': pd.Series([0, 1], dtype='M8[ns]')}, False,
+                     marks=[td.skip_if_np_lt_115]),
+        pytest.param(np.any, {'A': pd.Series([0, 1], dtype='M8[ns]')}, True,
+                     marks=[td.skip_if_np_lt_115]),
+        pytest.param(np.all, {'A': pd.Series([1, 2], dtype='M8[ns]')}, True,
+                     marks=[td.skip_if_np_lt_115]),
+        pytest.param(np.any, {'A': pd.Series([1, 2], dtype='M8[ns]')}, True,
+                     marks=[td.skip_if_np_lt_115]),
+        pytest.param(np.all, {'A': pd.Series([0, 1], dtype='m8[ns]')}, False,
+                     marks=[td.skip_if_np_lt_115]),
+        pytest.param(np.any, {'A': pd.Series([0, 1], dtype='m8[ns]')}, True,
+                     marks=[td.skip_if_np_lt_115]),
+        pytest.param(np.all, {'A': pd.Series([1, 2], dtype='m8[ns]')}, True,
+                     marks=[td.skip_if_np_lt_115]),
+        pytest.param(np.any, {'A': pd.Series([1, 2], dtype='m8[ns]')}, True,
+                     marks=[td.skip_if_np_lt_115]),
+        (np.all, {'A': pd.Series([0, 1], dtype='category')}, False),
+        (np.any, {'A': pd.Series([0, 1], dtype='category')}, True),
+        (np.all, {'A': pd.Series([1, 2], dtype='category')}, True),
+        (np.any, {'A': pd.Series([1, 2], dtype='category')}, True),
+
+        # # Mix
+        # GH-21484
+        # (np.all, {'A': pd.Series([10, 20], dtype='M8[ns]'),
+        #           'B': pd.Series([10, 20], dtype='m8[ns]')}, True),
+    ])
+    def test_any_all_np_func(self, func, data, expected):
+        # https://github.com/pandas-dev/pandas/issues/19976
+        data = DataFrame(data)
+        result = func(data)
+        assert isinstance(result, np.bool_)
+        assert result.item() is expected
+
+        # method version
+        result = getattr(DataFrame(data), func.__name__)(axis=None)
+        assert isinstance(result, np.bool_)
+        assert result.item() is expected
+
+    def test_any_all_object(self):
+        # https://github.com/pandas-dev/pandas/issues/19976
+        result = np.all(DataFrame(columns=['a', 'b'])).item()
+        assert result is True
+
+        result = np.any(DataFrame(columns=['a', 'b'])).item()
+        assert result is False
+
+    @pytest.mark.parametrize('method', ['any', 'all'])
+    def test_any_all_level_axis_none_raises(self, method):
+        df = DataFrame(
+            {"A": 1},
+            index=MultiIndex.from_product([['A', 'B'], ['a', 'b']],
+                                          names=['out', 'in'])
+        )
+        xpr = "Must specify 'axis' when aggregating by level."
+        with tm.assert_raises_regex(ValueError, xpr):
+            getattr(df, method)(axis=None, level='out')
+
     def _check_bool_op(self, name, alternative, frame=None, has_skipna=True,
                        has_bool_only=False):
         if frame is None:
@@ -2091,9 +2195,6 @@ def test_clip_against_list_like(self, inplace, lower, axis, res):
             result = original
         tm.assert_frame_equal(result, expected, check_exact=True)
 
-    @pytest.mark.xfail(
-        not _np_version_under1p15,
-        reason="failing under numpy-dev gh-19976")
     @pytest.mark.parametrize("axis", [0, 1, None])
     def test_clip_against_frame(self, axis):
         df = DataFrame(np.random.randn(1000, 2))
diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py
@@ -2707,3 +2707,10 @@ def test_panel_index():
                                        np.repeat([1, 2, 3], 4)],
                                       names=['time', 'panel'])
     tm.assert_index_equal(index, expected)
+
+
+def test_panel_np_all():
+    with catch_warnings(record=True):
+        wp = Panel({"A": DataFrame({'b': [1, 2]})})
+    result = np.all(wp)
+    assert result == np.bool_(True)
diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py