From 0644d90100bcc14e5e548bbef03a6b84af57b442 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 10 Feb 2021 10:24:03 +0100 Subject: [PATCH 1/4] [ArrayManager] Test DataFrame reductions + implement ignore_failures --- .github/workflows/ci.yml | 2 + pandas/core/internals/array_manager.py | 63 +++++++++++++++++----- pandas/tests/frame/test_reductions.py | 24 +++++++-- pandas/tests/reductions/test_reductions.py | 3 ++ 4 files changed, 74 insertions(+), 18 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b551e7ded0178..0fc7b4786fc49 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -157,3 +157,5 @@ jobs: run: | source activate pandas-dev pytest pandas/tests/frame/methods --array-manager + pytest pandas/tests/frame/test_reductions.py --array-manager + pytest pandas/tests/reductions/ --array-manager diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 0f677ff3180be..4429ac5b759c2 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -7,7 +7,7 @@ import numpy as np -from pandas._libs import algos as libalgos, lib +from pandas._libs import NaT, algos as libalgos, lib from pandas._typing import ArrayLike, DtypeObj, Hashable from pandas.util._validators import validate_bool_kwarg @@ -17,6 +17,7 @@ is_dtype_equal, is_extension_array_dtype, is_numeric_dtype, + is_timedelta64_ns_dtype, ) from pandas.core.dtypes.dtypes import ExtensionDtype, PandasDtype from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries @@ -25,7 +26,11 @@ import pandas.core.algorithms as algos from pandas.core.arrays import ExtensionArray from pandas.core.arrays.sparse import SparseDtype -from pandas.core.construction import extract_array +from pandas.core.construction import ( + ensure_wrapped_if_datetimelike, + extract_array, + sanitize_array, +) from pandas.core.indexers import maybe_convert_indices from pandas.core.indexes.api import Index, ensure_index from pandas.core.internals.base import DataManager @@ -173,18 +178,48 @@ def _verify_integrity(self) -> None: def reduce( self: T, func: Callable, ignore_failures: bool = False ) -> Tuple[T, np.ndarray]: - # TODO this still fails because `func` assumes to work on 2D arrays - # TODO implement ignore_failures - assert self.ndim == 2 + """ + Apply reduction function column-wise, returning a single-row ArrayManager. - res_arrays = [] - for arr in self.arrays: - res = func(arr, axis=0) - res_arrays.append(np.array([res])) + Parameters + ---------- + func : reduction function + ignore_failures : bool, default False + Whether to drop columns where func raises TypeError. + + Returns + ------- + ArrayManager + np.ndarray + Indexer of column indices that are retained. + """ + result_arrays: List[np.ndarray] = [] + result_indices: List[int] = [] + for i, arr in enumerate(self.arrays): + try: + res = func(arr, axis=0) + except TypeError: + if not ignore_failures: + raise + else: + # TODO NaT doesn't preserve dtype, so we need to ensure to create + # a timedelta result array if original was timedelta + # what if datetime results in timedelta? (eg std) + if res is NaT and is_timedelta64_ns_dtype(arr.dtype): + result_arrays.append(np.array(["NaT"], dtype="timedelta64[ns]")) + else: + result_arrays.append(sanitize_array([res], None)) + result_indices.append(i) index = Index([None]) # placeholder - new_mgr = type(self)(res_arrays, [index, self.items]) - indexer = np.arange(self.shape[0]) + if ignore_failures: + indexer = np.array(result_indices) + columns = self.items[result_indices] + else: + indexer = np.arange(self.shape[0]) + columns = self.items + + new_mgr = type(self)(result_arrays, [index, columns]) return new_mgr, indexer def operate_blockwise(self, other: ArrayManager, array_op) -> ArrayManager: @@ -668,9 +703,11 @@ def iset(self, loc: Union[int, slice, np.ndarray], value): if isinstance(value, np.ndarray) and value.ndim == 2: value = value[0, :] + # TODO we receive a datetime/timedelta64 ndarray from DataFrame._iset_item + # but we should avoid that and pass directly the proper array + value = ensure_wrapped_if_datetimelike(value) + assert isinstance(value, (np.ndarray, ExtensionArray)) - # value = np.asarray(value) - # assert isinstance(value, np.ndarray) assert len(value) == len(self._axes[0]) self.arrays[loc] = value return diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 1c397d6a6a1b5..839b2680ebc95 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1145,7 +1145,7 @@ def test_any_all_object(self): result = np.any(DataFrame(columns=["a", "b"])).item() assert result is False - def test_any_all_object_bool_only(self): + def test_any_all_object_bool_only(self, using_array_manager): df = DataFrame({"A": ["foo", 2], "B": [True, False]}).astype(object) df._consolidate_inplace() df["C"] = Series([True, True]) @@ -1153,21 +1153,35 @@ def test_any_all_object_bool_only(self): # The underlying bug is in DataFrame._get_bool_data, so we check # that while we're here res = df._get_bool_data() - expected = df[["B", "C"]] + + # With ArrayManager currently don't infer object dtype to be boolean + if using_array_manager: + expected = df[["C"]] + else: + expected = df[["B", "C"]] tm.assert_frame_equal(res, expected) res = df.all(bool_only=True, axis=0) - expected = Series([False, True], index=["B", "C"]) + if using_array_manager: + expected = Series([True], index=["C"]) + else: + expected = Series([False, True], index=["B", "C"]) tm.assert_series_equal(res, expected) # operating on a subset of columns should not produce a _larger_ Series res = df[["B", "C"]].all(bool_only=True, axis=0) tm.assert_series_equal(res, expected) - assert not df.all(bool_only=True, axis=None) + if using_array_manager: + assert df.all(bool_only=True, axis=None) + else: + assert not df.all(bool_only=True, axis=None) res = df.any(bool_only=True, axis=0) - expected = Series([True, True], index=["B", "C"]) + if using_array_manager: + expected = Series([True], index=["C"]) + else: + expected = Series([True, True], index=["B", "C"]) tm.assert_series_equal(res, expected) # operating on a subset of columns should not produce a _larger_ Series diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index cb64b2423696f..3e85a75aa350b 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -3,6 +3,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( Categorical, @@ -288,6 +290,7 @@ def test_numpy_minmax_timedelta64(self): with pytest.raises(ValueError, match=errmsg): np.argmax(td, out=0) + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile def test_timedelta_ops(self): # GH#4984 # make sure ops return Timedelta From 77395d880c287ed69cd315df9cd51366c3918e61 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 11 Feb 2021 09:28:26 +0100 Subject: [PATCH 2/4] temp --- pandas/core/internals/array_manager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 4429ac5b759c2..06a501af905ea 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -32,7 +32,7 @@ sanitize_array, ) from pandas.core.indexers import maybe_convert_indices -from pandas.core.indexes.api import Index, ensure_index +from pandas.core.indexes.api import Index, RangeIndex, ensure_index from pandas.core.internals.base import DataManager from pandas.core.internals.blocks import make_block @@ -211,7 +211,7 @@ def reduce( result_arrays.append(sanitize_array([res], None)) result_indices.append(i) - index = Index([None]) # placeholder + index = RangeIndex(1) # placeholder if ignore_failures: indexer = np.array(result_indices) columns = self.items[result_indices] From ac6117a572346fd13578064c7a9d3edf0b24837a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 12 Feb 2021 09:52:00 +0100 Subject: [PATCH 3/4] infer bool for object dtype --- pandas/core/internals/array_manager.py | 23 ++++++++++++++--------- pandas/tests/frame/test_reductions.py | 24 +++++------------------- 2 files changed, 19 insertions(+), 28 deletions(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 06a501af905ea..fad74bd251c1a 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -17,6 +17,7 @@ is_dtype_equal, is_extension_array_dtype, is_numeric_dtype, + is_object_dtype, is_timedelta64_ns_dtype, ) from pandas.core.dtypes.dtypes import ExtensionDtype, PandasDtype @@ -32,7 +33,7 @@ sanitize_array, ) from pandas.core.indexers import maybe_convert_indices -from pandas.core.indexes.api import Index, RangeIndex, ensure_index +from pandas.core.indexes.api import Index, ensure_index from pandas.core.internals.base import DataManager from pandas.core.internals.blocks import make_block @@ -211,7 +212,7 @@ def reduce( result_arrays.append(sanitize_array([res], None)) result_indices.append(i) - index = RangeIndex(1) # placeholder + index = Index._simple_new(np.array([None], dtype=object)) # placeholder if ignore_failures: indexer = np.array(result_indices) columns = self.items[result_indices] @@ -508,15 +509,19 @@ def is_single_block(self) -> bool: def get_bool_data(self, copy: bool = False) -> ArrayManager: """ - Parameters - ---------- - copy : bool, default False - Whether to copy the blocks + Select columns that are bool-dtype and object-dtype columns that are all-bool. """ - mask = np.array([is_bool_dtype(t) for t in self.get_dtypes()], dtype="object") - arrays = [self.arrays[i] for i in np.nonzero(mask)[0]] + arrays = [] + indices = [] + for i, arr in enumerate(self.arrays): + if is_bool_dtype(arr.dtype) or ( + is_object_dtype(arr.dtype) and lib.is_bool_array(arr) + ): + arrays.append(arr) + indices.append(i) + # TODO copy? - new_axes = [self._axes[0], self._axes[1][mask]] + new_axes = [self._axes[0], self._axes[1][indices]] return type(self)(arrays, new_axes) def get_numeric_data(self, copy: bool = False) -> ArrayManager: diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 839b2680ebc95..1c397d6a6a1b5 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1145,7 +1145,7 @@ def test_any_all_object(self): result = np.any(DataFrame(columns=["a", "b"])).item() assert result is False - def test_any_all_object_bool_only(self, using_array_manager): + def test_any_all_object_bool_only(self): df = DataFrame({"A": ["foo", 2], "B": [True, False]}).astype(object) df._consolidate_inplace() df["C"] = Series([True, True]) @@ -1153,35 +1153,21 @@ def test_any_all_object_bool_only(self, using_array_manager): # The underlying bug is in DataFrame._get_bool_data, so we check # that while we're here res = df._get_bool_data() - - # With ArrayManager currently don't infer object dtype to be boolean - if using_array_manager: - expected = df[["C"]] - else: - expected = df[["B", "C"]] + expected = df[["B", "C"]] tm.assert_frame_equal(res, expected) res = df.all(bool_only=True, axis=0) - if using_array_manager: - expected = Series([True], index=["C"]) - else: - expected = Series([False, True], index=["B", "C"]) + expected = Series([False, True], index=["B", "C"]) tm.assert_series_equal(res, expected) # operating on a subset of columns should not produce a _larger_ Series res = df[["B", "C"]].all(bool_only=True, axis=0) tm.assert_series_equal(res, expected) - if using_array_manager: - assert df.all(bool_only=True, axis=None) - else: - assert not df.all(bool_only=True, axis=None) + assert not df.all(bool_only=True, axis=None) res = df.any(bool_only=True, axis=0) - if using_array_manager: - expected = Series([True], index=["C"]) - else: - expected = Series([True, True], index=["B", "C"]) + expected = Series([True, True], index=["B", "C"]) tm.assert_series_equal(res, expected) # operating on a subset of columns should not produce a _larger_ Series From 748d767cf0a8e1f188b3d4dd1600053c5cc290d0 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 15 Feb 2021 19:58:04 +0100 Subject: [PATCH 4/4] enable more tests --- .github/workflows/ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0fc7b4786fc49..a07ae48908b81 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -159,3 +159,4 @@ jobs: pytest pandas/tests/frame/methods --array-manager pytest pandas/tests/frame/test_reductions.py --array-manager pytest pandas/tests/reductions/ --array-manager + pytest pandas/tests/generic/test_generic.py --array-manager