From e14e16f593dfe7fe63e97a148d5c5b8d98408b7c Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 26 Jan 2022 08:27:45 -0800 Subject: [PATCH 1/3] BUG: ArrayManager indexing mismatched behavior --- pandas/core/indexing.py | 13 ++++++------- pandas/core/internals/array_manager.py | 7 +++---- pandas/tests/indexing/test_iloc.py | 18 ++++-------------- pandas/tests/indexing/test_indexing.py | 5 ----- pandas/tests/indexing/test_loc.py | 13 +++++-------- 5 files changed, 18 insertions(+), 38 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 5f9bc142c5836..a98ba832e8a62 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -20,6 +20,7 @@ from pandas.util._decorators import doc from pandas.util._exceptions import find_stack_level +from pandas.core.dtypes.cast import can_hold_element from pandas.core.dtypes.common import ( is_array_like, is_bool_dtype, @@ -1583,15 +1584,13 @@ def _setitem_with_indexer(self, indexer, value, name="iloc"): # if there is only one block/type, still have to take split path # unless the block is one-dimensional or it can hold the value - if ( - not take_split_path - and getattr(self.obj._mgr, "blocks", False) - and self.ndim > 1 - ): + if not take_split_path and len(self.obj._mgr.arrays) and self.ndim > 1: # in case of dict, keys are indices val = list(value.values()) if isinstance(value, dict) else value - blk = self.obj._mgr.blocks[0] - take_split_path = not blk._can_hold_element(val) + arr = self.obj._mgr.arrays[0] + take_split_path = not can_hold_element( + arr, extract_array(val, extract_numpy=True) + ) # if we have any multi-indexes that have non-trivial slices # (not null slices) then we must take the split path, xref diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index a837c35b89f1a..682fda8ea1f9e 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -341,9 +341,8 @@ def where(self: T, other, cond, align: bool) -> T: cond=cond, ) - # TODO what is this used for? - # def setitem(self, indexer, value) -> ArrayManager: - # return self.apply_with_block("setitem", indexer=indexer, value=value) + def setitem(self, indexer, value) -> ArrayManager: + return self.apply_with_block("setitem", indexer=indexer, value=value) def putmask(self, mask, new, align: bool = True): if align: @@ -467,7 +466,7 @@ def is_view(self) -> bool: @property def is_single_block(self) -> bool: - return False + return len(self.arrays) == 1 def _get_data_subset(self: T, predicate: Callable) -> T: indices = [i for i, arr in enumerate(self.arrays) if predicate(arr)] diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index e028f6f293c62..68c2e3198a8c2 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -82,9 +82,7 @@ def test_iloc_setitem_fullcol_categorical(self, indexer, key, using_array_manage overwrite = isinstance(key, slice) and key == slice(None) - if overwrite or using_array_manager: - # TODO(ArrayManager) we always overwrite because ArrayManager takes - # the "split" path, which still overwrites + if overwrite: # TODO: GH#39986 this probably shouldn't behave differently expected = DataFrame({0: cat}) assert not np.shares_memory(df.values, orig_vals) @@ -108,13 +106,13 @@ def test_iloc_setitem_fullcol_categorical(self, indexer, key, using_array_manage tm.assert_frame_equal(df, expected) @pytest.mark.parametrize("box", [array, Series]) - def test_iloc_setitem_ea_inplace(self, frame_or_series, box, using_array_manager): + def test_iloc_setitem_ea_inplace(self, frame_or_series, box): # GH#38952 Case with not setting a full column # IntegerArray without NAs arr = array([1, 2, 3, 4]) obj = frame_or_series(arr.to_numpy("i8")) - if frame_or_series is Series or not using_array_manager: + if frame_or_series is Series: values = obj.values else: values = obj[0].values @@ -131,10 +129,7 @@ def test_iloc_setitem_ea_inplace(self, frame_or_series, box, using_array_manager if frame_or_series is Series: assert obj.values is values else: - if using_array_manager: - assert obj[0].values is values - else: - assert obj.values.base is values.base and values.base is not None + assert np.shares_memory(obj[0].values, values) def test_is_scalar_access(self): # GH#32085 index with duplicates doesn't matter for _is_scalar_access @@ -999,9 +994,6 @@ def test_iloc_getitem_readonly_key(self): expected = df["data"].loc[[1, 3, 6]] tm.assert_series_equal(result, expected) - # TODO(ArrayManager) setting single item with an iterable doesn't work yet - # in the "split" path - @td.skip_array_manager_not_yet_implemented def test_iloc_assign_series_to_df_cell(self): # GH 37593 df = DataFrame(columns=["a"], index=[0]) @@ -1224,8 +1216,6 @@ def test_iloc_getitem_setitem_fancy_exceptions(self, float_frame): # GH#32257 we let numpy do validation, get their exception float_frame.iloc[:, :, :] = 1 - # TODO(ArrayManager) "split" path doesn't properly implement DataFrame indexer - @td.skip_array_manager_not_yet_implemented def test_iloc_frame_indexer(self): # GH#39004 df = DataFrame({"a": [1, 2, 3]}) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index c91a5c2764b34..6e1c3686f5dbe 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -7,8 +7,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - from pandas.core.dtypes.common import ( is_float_dtype, is_integer_dtype, @@ -504,9 +502,6 @@ def test_multi_assign_broadcasting_rhs(self): df.loc[df["A"] == 0, ["A", "B"]] = df["D"] tm.assert_frame_equal(df, expected) - # TODO(ArrayManager) setting single item with an iterable doesn't work yet - # in the "split" path - @td.skip_array_manager_not_yet_implemented def test_setitem_list(self): # GH 6043 diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 4b751fa7d5e3e..27c8efd89a9b4 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -669,7 +669,7 @@ def test_loc_modify_datetime(self): tm.assert_frame_equal(df, expected) - def test_loc_setitem_frame_with_reindex(self, using_array_manager): + def test_loc_setitem_frame_with_reindex(self): # GH#6254 setting issue df = DataFrame(index=[3, 5, 4], columns=["A"], dtype=float) df.loc[[4, 3, 5], "A"] = np.array([1, 2, 3], dtype="int64") @@ -677,10 +677,6 @@ def test_loc_setitem_frame_with_reindex(self, using_array_manager): # setting integer values into a float dataframe with loc is inplace, # so we retain float dtype ser = Series([2, 3, 1], index=[3, 5, 4], dtype=float) - if using_array_manager: - # TODO(ArrayManager) with "split" path, we still overwrite the column - # and therefore don't take the dtype of the underlying object into account - ser = Series([2, 3, 1], index=[3, 5, 4], dtype="int64") expected = DataFrame({"A": ser}) tm.assert_frame_equal(df, expected) @@ -702,9 +698,6 @@ def test_loc_setitem_frame_with_inverted_slice(self): expected = DataFrame({"A": [3, 2, 1], "B": "string"}, index=[1, 2, 3]) tm.assert_frame_equal(df, expected) - # TODO(ArrayManager) "split" path overwrites column and therefore don't take - # the dtype of the underlying object into account - @td.skip_array_manager_not_yet_implemented def test_loc_setitem_empty_frame(self): # GH#6252 setting with an empty frame keys1 = ["@" + str(i) for i in range(5)] @@ -1225,6 +1218,10 @@ def test_loc_getitem_time_object(self, frame_or_series): @pytest.mark.parametrize("spmatrix_t", ["coo_matrix", "csc_matrix", "csr_matrix"]) @pytest.mark.parametrize("dtype", [np.int64, np.float64, complex]) @td.skip_if_no_scipy + @pytest.mark.filterwarnings( + # TODO(2.0): remove filtering; note only needed for using_array_manager + "ignore:The behavior of .astype from SparseDtype.*FutureWarning" + ) def test_loc_getitem_range_from_spmatrix(self, spmatrix_t, dtype): import scipy.sparse From 42a0cdcbc27789cff5745d436e833a5eb49fd950 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 26 Jan 2022 10:00:45 -0800 Subject: [PATCH 2/3] Remove ArrayManager xfails --- pandas/tests/extension/base/setitem.py | 20 -------------------- pandas/tests/frame/indexing/test_indexing.py | 2 -- pandas/tests/frame/methods/test_quantile.py | 14 +------------- 3 files changed, 1 insertion(+), 35 deletions(-) diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index 208a1a1757be2..7a822bd3c607a 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -1,13 +1,6 @@ import numpy as np import pytest -from pandas.core.dtypes.dtypes import ( - DatetimeTZDtype, - IntervalDtype, - PandasDtype, - PeriodDtype, -) - import pandas as pd import pandas._testing as tm from pandas.tests.extension.base.base import BaseExtensionTests @@ -367,19 +360,6 @@ def test_setitem_series(self, data, full_indexer): def test_setitem_frame_2d_values(self, data, request): # GH#44514 df = pd.DataFrame({"A": data}) - - # Avoiding using_array_manager fixture - # https://github.com/pandas-dev/pandas/pull/44514#discussion_r754002410 - using_array_manager = isinstance(df._mgr, pd.core.internals.ArrayManager) - if using_array_manager: - if not isinstance( - data.dtype, (PandasDtype, PeriodDtype, IntervalDtype, DatetimeTZDtype) - ): - # These dtypes have non-broken implementations of _can_hold_element - mark = pytest.mark.xfail(reason="Goes through split path, loses dtype") - request.node.add_marker(mark) - - df = pd.DataFrame({"A": data}) orig = df.copy() df.iloc[:] = df diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 5a97c591fd621..928f1742e1898 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1212,8 +1212,6 @@ def test_setitem_array_as_cell_value(self): expected = DataFrame({"a": [np.zeros((2,))], "b": [np.zeros((2, 2))]}) tm.assert_frame_equal(df, expected) - # with AM goes through split-path, loses dtype - @td.skip_array_manager_not_yet_implemented def test_iloc_setitem_nullable_2d_values(self): df = DataFrame({"A": [1, 2, 3]}, dtype="Int64") orig = df.copy() diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index d60fe7d4f316d..76ef57391a7b3 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -673,19 +673,7 @@ def test_quantile_ea_with_na(self, obj, index): # TODO(GH#39763): filtering can be removed after GH#39763 is fixed @pytest.mark.filterwarnings("ignore:Using .astype to convert:FutureWarning") - def test_quantile_ea_all_na( - self, obj, index, frame_or_series, using_array_manager, request - ): - if ( - using_array_manager - and frame_or_series is DataFrame - and index.dtype == "m8[ns]" - ): - mark = pytest.mark.xfail( - reason="obj.astype fails bc obj is incorrectly dt64 at this point" - ) - request.node.add_marker(mark) - + def test_quantile_ea_all_na(self, obj, index, frame_or_series, request): obj.iloc[:] = index._na_value # TODO(ArrayManager): this casting should be unnecessary after GH#39763 is fixed From 114045c11d740418031cfbf56575b3220b105946 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 26 Jan 2022 13:28:31 -0800 Subject: [PATCH 3/3] mypy fixup --- pandas/core/internals/array_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 682fda8ea1f9e..80e1d0bede2cd 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -341,7 +341,7 @@ def where(self: T, other, cond, align: bool) -> T: cond=cond, ) - def setitem(self, indexer, value) -> ArrayManager: + def setitem(self: T, indexer, value) -> T: return self.apply_with_block("setitem", indexer=indexer, value=value) def putmask(self, mask, new, align: bool = True):