[ArrayManager] Test DataFrame reductions + implement ignore_failures (#39719)

jorisvandenbossche · web-flow · commit 36ff425d12ee · 2021-02-24T19:39:26.000-05:00
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -153,6 +153,9 @@ jobs:
       run: |
         source activate pandas-dev
         pytest pandas/tests/frame/methods --array-manager
+        pytest pandas/tests/frame/test_reductions.py --array-manager
+        pytest pandas/tests/reductions/ --array-manager
+        pytest pandas/tests/generic/test_generic.py --array-manager
         pytest pandas/tests/arithmetic/ --array-manager
         pytest pandas/tests/reshape/merge --array-manager
 
diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py
@@ -16,7 +16,10 @@
 
 import numpy as np
 
-from pandas._libs import lib
+from pandas._libs import (
+    NaT,
+    lib,
+)
 from pandas._typing import (
     ArrayLike,
     DtypeObj,
@@ -33,6 +36,8 @@
     is_dtype_equal,
     is_extension_array_dtype,
     is_numeric_dtype,
+    is_object_dtype,
+    is_timedelta64_ns_dtype,
 )
 from pandas.core.dtypes.dtypes import (
     ExtensionDtype,
@@ -50,7 +55,11 @@
 import pandas.core.algorithms as algos
 from pandas.core.arrays import ExtensionArray
 from pandas.core.arrays.sparse import SparseDtype
-from pandas.core.construction import extract_array
+from pandas.core.construction import (
+    ensure_wrapped_if_datetimelike,
+    extract_array,
+    sanitize_array,
+)
 from pandas.core.indexers import maybe_convert_indices
 from pandas.core.indexes.api import (
     Index,
@@ -201,18 +210,48 @@ def _verify_integrity(self) -> None:
     def reduce(
         self: T, func: Callable, ignore_failures: bool = False
     ) -> Tuple[T, np.ndarray]:
-        # TODO this still fails because `func` assumes to work on 2D arrays
-        # TODO implement ignore_failures
-        assert self.ndim == 2
+        """
+        Apply reduction function column-wise, returning a single-row ArrayManager.
 
-        res_arrays = []
-        for arr in self.arrays:
-            res = func(arr, axis=0)
-            res_arrays.append(np.array([res]))
+        Parameters
+        ----------
+        func : reduction function
+        ignore_failures : bool, default False
+            Whether to drop columns where func raises TypeError.
 
-        index = Index([None])  # placeholder
-        new_mgr = type(self)(res_arrays, [index, self.items])
-        indexer = np.arange(self.shape[0])
+        Returns
+        -------
+        ArrayManager
+        np.ndarray
+            Indexer of column indices that are retained.
+        """
+        result_arrays: List[np.ndarray] = []
+        result_indices: List[int] = []
+        for i, arr in enumerate(self.arrays):
+            try:
+                res = func(arr, axis=0)
+            except TypeError:
+                if not ignore_failures:
+                    raise
+            else:
+                # TODO NaT doesn't preserve dtype, so we need to make sure we create
+                # a timedelta result array if the original was timedelta
+                # what if datetime results in timedelta? (e.g. std)
+                if res is NaT and is_timedelta64_ns_dtype(arr.dtype):
+                    result_arrays.append(np.array(["NaT"], dtype="timedelta64[ns]"))
+                else:
+                    result_arrays.append(sanitize_array([res], None))
+                result_indices.append(i)
+
+        index = Index._simple_new(np.array([None], dtype=object))  # placeholder
+        if ignore_failures:
+            indexer = np.array(result_indices)
+            columns = self.items[result_indices]
+        else:
+            indexer = np.arange(self.shape[0])
+            columns = self.items
+
+        new_mgr = type(self)(result_arrays, [index, columns])
         return new_mgr, indexer
 
     def operate_blockwise(self, other: ArrayManager, array_op) -> ArrayManager:
@@ -489,14 +528,17 @@ def _get_data_subset(self, predicate: Callable) -> ArrayManager:
 
     def get_bool_data(self, copy: bool = False) -> ArrayManager:
         """
-        Select columns that are bool-dtype.
+        Select bool-dtype columns and object-dtype columns that are all-bool.
 
         Parameters
         ----------
         copy : bool, default False
             Whether to copy the blocks
         """
-        return self._get_data_subset(lambda arr: is_bool_dtype(arr.dtype))
+        return self._get_data_subset(
+            lambda arr: is_bool_dtype(arr.dtype)
+            or (is_object_dtype(arr.dtype) and lib.is_bool_array(arr))
+        )
 
     def get_numeric_data(self, copy: bool = False) -> ArrayManager:
         """
@@ -693,6 +735,10 @@ def iset(self, loc: Union[int, slice, np.ndarray], value):
                 assert value.shape[1] == 1
                 value = value[0, :]
 
+            # TODO we receive a datetime/timedelta64 ndarray from DataFrame._iset_item
+            # but we should avoid that and pass the proper array directly
+            value = ensure_wrapped_if_datetimelike(value)
+
             assert isinstance(value, (np.ndarray, ExtensionArray))
             assert value.ndim == 1
             assert len(value) == len(self._axes[0])
diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py
@@ -6,6 +6,8 @@
 import numpy as np
 import pytest
 
+import pandas.util._test_decorators as td
+
 import pandas as pd
 from pandas import (
     Categorical,
@@ -291,6 +293,7 @@ def test_numpy_minmax_timedelta64(self):
         with pytest.raises(ValueError, match=errmsg):
             np.argmax(td, out=0)
 
+    @td.skip_array_manager_not_yet_implemented  # TODO(ArrayManager) quantile
     def test_timedelta_ops(self):
         # GH#4984
         # make sure ops return Timedelta