From 46acf444e5b0b8d630623f3b430b5c9bb7c097a2 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 13 Jan 2021 17:32:31 -0800 Subject: [PATCH 01/13] TST: split coercion tests --- .../tests/frame/indexing/test_categorical.py | 87 ++++++++++--------- pandas/tests/indexing/test_loc.py | 53 +++++------ 2 files changed, 70 insertions(+), 70 deletions(-) diff --git a/pandas/tests/frame/indexing/test_categorical.py b/pandas/tests/frame/indexing/test_categorical.py index 6137cadc93125..329987350b7a9 100644 --- a/pandas/tests/frame/indexing/test_categorical.py +++ b/pandas/tests/frame/indexing/test_categorical.py @@ -54,7 +54,53 @@ def test_assignment(self): cat = Categorical([1, 2, 3, 10], categories=[1, 2, 3, 4, 10]) df = DataFrame(Series(cat)) - def test_assigning_ops(self): + @pytest.fixture + def dfs_for_assignment(self): + cats = Categorical(["a", "a", "a", "a", "a", "a", "a"], categories=["a", "b"]) + idx = Index(["h", "i", "j", "k", "l", "m", "n"]) + values = [1, 1, 1, 1, 1, 1, 1] + orig = DataFrame({"cats": cats, "values": values}, index=idx) + + # changed multiple rows + cats2 = Categorical(["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"]) + idx2 = Index(["h", "i", "j", "k", "l", "m", "n"]) + values2 = [1, 1, 2, 2, 1, 1, 1] + exp_multi_row = DataFrame({"cats": cats2, "values": values2}, index=idx2) + return orig, exp_multi_row + + def test_iloc_setitem_multiple_rows(self, dfs_for_assignment): + # - assign multiple rows (mixed values) -> exp_multi_row + orig, exp_multi_row = dfs_for_assignment + df = orig.copy() + + df.iloc[2:4, :] = [["b", 2], ["b", 2]] + tm.assert_frame_equal(df, exp_multi_row) + + msg1 = ( + "Cannot setitem on a Categorical with a new category, " + "set the categories first" + ) + with pytest.raises(ValueError, match=msg1): + df = orig.copy() + df.iloc[2:4, :] = [["c", 2], ["c", 2]] + + def test_loc_setitem_multiple_rows(self, dfs_for_assignment): + # - assign multiple rows (mixed values) -> exp_multi_row + orig, exp_multi_row = 
dfs_for_assignment + df = orig.copy() + + df.loc["j":"k", :] = [["b", 2], ["b", 2]] + tm.assert_frame_equal(df, exp_multi_row) + + msg1 = ( + "Cannot setitem on a Categorical with a new category, " + "set the categories first" + ) + with pytest.raises(ValueError, match=msg1): + df = orig.copy() + df.loc["j":"k", :] = [["c", 2], ["c", 2]] + + def test_assigning_ops(self, dfs_for_assignment): # systematically test the assigning operations: # for all slicing ops: # for value in categories and value not in categories: @@ -70,11 +116,7 @@ def test_assigning_ops(self): # assign a part of a column with dtype != categorical -> # exp_parts_cats_col - - cats = Categorical(["a", "a", "a", "a", "a", "a", "a"], categories=["a", "b"]) - idx = Index(["h", "i", "j", "k", "l", "m", "n"]) - values = [1, 1, 1, 1, 1, 1, 1] - orig = DataFrame({"cats": cats, "values": values}, index=idx) + orig, exp_multi_row = dfs_for_assignment # the expected values # changed single row @@ -83,12 +125,6 @@ def test_assigning_ops(self): values1 = [1, 1, 2, 1, 1, 1, 1] exp_single_row = DataFrame({"cats": cats1, "values": values1}, index=idx1) - # changed multiple rows - cats2 = Categorical(["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"]) - idx2 = Index(["h", "i", "j", "k", "l", "m", "n"]) - values2 = [1, 1, 2, 2, 1, 1, 1] - exp_multi_row = DataFrame({"cats": cats2, "values": values2}, index=idx2) - # changed part of the cats column cats3 = Categorical(["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"]) idx3 = Index(["h", "i", "j", "k", "l", "m", "n"]) @@ -134,15 +170,6 @@ def test_assigning_ops(self): df = orig.copy() df.iloc[2, :] = ["c", 2] - # - assign multiple rows (mixed values) -> exp_multi_row - df = orig.copy() - df.iloc[2:4, :] = [["b", 2], ["b", 2]] - tm.assert_frame_equal(df, exp_multi_row) - - with pytest.raises(ValueError, match=msg1): - df = orig.copy() - df.iloc[2:4, :] = [["c", 2], ["c", 2]] - # assign a part of a column with dtype == categorical -> # 
exp_parts_cats_col df = orig.copy() @@ -194,15 +221,6 @@ def test_assigning_ops(self): df = orig.copy() df.loc["j", :] = ["c", 2] - # - assign multiple rows (mixed values) -> exp_multi_row - df = orig.copy() - df.loc["j":"k", :] = [["b", 2], ["b", 2]] - tm.assert_frame_equal(df, exp_multi_row) - - with pytest.raises(ValueError, match=msg1): - df = orig.copy() - df.loc["j":"k", :] = [["c", 2], ["c", 2]] - # assign a part of a column with dtype == categorical -> # exp_parts_cats_col df = orig.copy() @@ -258,15 +276,6 @@ def test_assigning_ops(self): df = orig.copy() df.loc["j", :] = ["c", 2] - # - assign multiple rows (mixed values) -> exp_multi_row - df = orig.copy() - df.loc["j":"k", :] = [["b", 2], ["b", 2]] - tm.assert_frame_equal(df, exp_multi_row) - - with pytest.raises(ValueError, match=msg1): - df = orig.copy() - df.loc["j":"k", :] = [["c", 2], ["c", 2]] - # assign a part of a column with dtype == categorical -> # exp_parts_cats_col df = orig.copy() diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 7c73917e44b22..8b13bafdd012f 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -395,73 +395,64 @@ def test_loc_general(self): tm.assert_series_equal(result, expected) assert result.dtype == object - def test_loc_setitem_consistency(self): - # GH 6149 - # coerce similarly for setitem and loc when rows have a null-slice - expected = DataFrame( + @pytest.fixture + def frame_for_consistency(self): + return DataFrame( { - "date": Series(0, index=range(5), dtype=np.int64), + "date": date_range("2000-01-01", "2000-01-5"), "val": Series(range(5), dtype=np.int64), } ) - df = DataFrame( + def test_loc_setitem_consistency(self, frame_for_consistency): + # GH 6149 + # coerce similarly for setitem and loc when rows have a null-slice + expected = DataFrame( { - "date": date_range("2000-01-01", "2000-01-5"), + "date": Series(0, index=range(5), dtype=np.int64), "val": Series(range(5), dtype=np.int64), } 
) + df = frame_for_consistency.copy() df.loc[:, "date"] = 0 tm.assert_frame_equal(df, expected) - df = DataFrame( - { - "date": date_range("2000-01-01", "2000-01-5"), - "val": Series(range(5), dtype=np.int64), - } - ) + df = frame_for_consistency.copy() df.loc[:, "date"] = np.array(0, dtype=np.int64) tm.assert_frame_equal(df, expected) - df = DataFrame( - { - "date": date_range("2000-01-01", "2000-01-5"), - "val": Series(range(5), dtype=np.int64), - } - ) + df = frame_for_consistency.copy() df.loc[:, "date"] = np.array([0, 0, 0, 0, 0], dtype=np.int64) tm.assert_frame_equal(df, expected) + def test_loc_setitem_consistency_dt64_to_str(self, frame_for_consistency): + # GH 6149 + # coerce similarly for setitem and loc when rows have a null-slice + expected = DataFrame( { "date": Series("foo", index=range(5)), "val": Series(range(5), dtype=np.int64), } ) - df = DataFrame( - { - "date": date_range("2000-01-01", "2000-01-5"), - "val": Series(range(5), dtype=np.int64), - } - ) + df = frame_for_consistency.copy() df.loc[:, "date"] = "foo" tm.assert_frame_equal(df, expected) + def test_loc_setitem_consistency_dt64_to_float(self, frame_for_consistency): + # GH 6149 + # coerce similarly for setitem and loc when rows have a null-slice expected = DataFrame( { "date": Series(1.0, index=range(5)), "val": Series(range(5), dtype=np.int64), } ) - df = DataFrame( - { - "date": date_range("2000-01-01", "2000-01-5"), - "val": Series(range(5), dtype=np.int64), - } - ) + df = frame_for_consistency.copy() df.loc[:, "date"] = 1.0 tm.assert_frame_equal(df, expected) + def test_loc_setitem_consistency_single_row(self): # GH 15494 # setting on frame with single row df = DataFrame({"date": Series([Timestamp("20180101")])}) From 5ca4ab25bfbc502def0d620a136f9823341d6c56 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 13 Jan 2021 17:36:17 -0800 Subject: [PATCH 02/13] REF: implement BlockManager.setitem2 --- pandas/core/dtypes/missing.py | 19 ++++++++- pandas/core/frame.py | 10 ++++- 
pandas/core/indexing.py | 13 ++++--- pandas/core/internals/blocks.py | 48 +++++++++++++++-------- pandas/core/internals/managers.py | 64 +++++++++++++++++++++++++++++++ pandas/core/series.py | 2 +- 6 files changed, 130 insertions(+), 26 deletions(-) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index f0455c01fa085..a30875a2783d2 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -506,7 +506,7 @@ def array_equals(left: ArrayLike, right: ArrayLike) -> bool: return array_equivalent(left, right, dtype_equal=True) -def infer_fill_value(val): +def infer_fill_value(val, length: int): """ infer the fill value for the nan/NaT from the provided scalar/ndarray/list-like if we are a NaT, return the correct dtyped @@ -514,6 +514,23 @@ def infer_fill_value(val): """ if not is_list_like(val): val = [val] + + if type(val).__name__ == "PandasArray": + # for test_numpy test where we patch PandasArray._typ + val = val.to_numpy() + + if is_extension_array_dtype(val): + # We cannot use dtype._na_value bc pd.NA/pd.NaT do not preserve dtype + if len(val) == length: + # TODO: in this case see if we can avoid making a copy later on + return val + if length == 0: + return val[:0].copy() + + dtype = val.dtype + cls = dtype.construct_array_type() + return cls._from_sequence([dtype._na_value], dtype=dtype).repeat(length) + val = np.array(val, copy=False) if needs_i8_conversion(val.dtype): return np.array("NaT", dtype=val.dtype) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 36ccd0b8a2f7d..787cb0a1ff12b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3229,7 +3229,13 @@ def _setitem_array(self, key, value): key, axis=1, raise_missing=False )[1] self._check_setitem_copy() - self.iloc[:, indexer] = value + + if is_scalar(value): + indexer = self.iloc._ensure_iterable_column_indexer(indexer) + for i in indexer: + self[self.columns[i]] = value + else: + self.iloc[:, indexer] = value # TODO: indicate 
not-inplace def _setitem_frame(self, key, value): # support boolean setting with DataFrame input, e.g. @@ -3286,7 +3292,7 @@ def _set_item_mgr(self, key, value): if len(self): self._check_setitem_copy() - def _iset_item(self, loc: int, value): + def _iset_item(self, loc: int, value): # only called from _setitem_single_column value = self._sanitize_column(value) value = _maybe_atleast_2d(value) self._iset_item_mgr(loc, value) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index f1f3265c9f970..31aa3c81374bf 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1603,7 +1603,7 @@ def _setitem_with_indexer(self, indexer, value, name="iloc"): # We are setting an entire column self.obj[key] = value else: - self.obj[key] = infer_fill_value(value) + self.obj[key] = infer_fill_value(value, len(self.obj)) new_indexer = convert_from_missing_indexer_tuple( indexer, self.obj.axes @@ -1725,8 +1725,8 @@ def _setitem_with_indexer_split_path(self, indexer, value, name: str): else: # scalar value - for loc in ilocs: - self._setitem_single_column(loc, value, pi) + self.obj._mgr = self.obj._mgr.setitem2((pi, ilocs), value) + self.obj._clear_item_cache() def _setitem_with_indexer_2d_value(self, indexer, value): # We get here with np.ndim(value) == 2, excluding DataFrame, @@ -1742,6 +1742,8 @@ def _setitem_with_indexer_2d_value(self, indexer, value): "Must have equal len keys and value when setting with an ndarray" ) + # self.obj._mgr = self.obj._mgr.setitem2((pi, ilocs), value) + # need to make setitem2 re-coerce for i, loc in enumerate(ilocs): # setting with a list, re-coerces self._setitem_single_column(loc, value[:, i].tolist(), pi) @@ -1758,9 +1760,8 @@ def _setitem_with_indexer_frame_value(self, indexer, value: DataFrame, name: str # We do not want to align the value in case of iloc GH#37728 if name == "iloc": - for i, loc in enumerate(ilocs): - val = value.iloc[:, i] - self._setitem_single_column(loc, val, pi) + self.obj._mgr = 
self.obj._mgr.setitem2((pi, ilocs), value) + self.obj._clear_item_cache() elif not unique_cols and value.columns.equals(self.obj.columns): # We assume we are already aligned, see diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 6f6f17171537f..cbf4f066598dc 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -902,8 +902,24 @@ def setitem(self, indexer, value): if self.is_numeric: value = np.nan - # coerce if block dtype can store value values = self.values + + if is_extension_array_dtype(getattr(value, "dtype", None)): + # We need to be careful not to allow through strings that + # can be parsed to EADtypes + is_ea_value = True + arr_value = value + else: + is_ea_value = False + arr_value = np.array(value) + + if transpose: + values = values.T + + # length checking + check_setitem_lengths(indexer, value, values) + exact_match = is_exact_shape_match(values, arr_value) + if not self._can_hold_element(value): # current dtype cannot store value, coerce to common dtype # TODO: can we just use coerce_to_target_dtype for all this @@ -919,6 +935,14 @@ def setitem(self, indexer, value): dtype, _ = maybe_promote(np.array(value).dtype) return self.astype(dtype).setitem(indexer, value) + if isinstance(indexer, tuple) and len(indexer) == self.ndim: + if com.is_null_slice(indexer[0]): + value2 = lib.item_from_zerodim(value) + if lib.is_scalar(value2): + # TODO: de-duplicate with similar in setitem_single_block + value2 = np.full(self.shape, arr_value) + return self.make_block(value2) + dtype = find_common_type([values.dtype, dtype]) assert not is_dtype_equal(self.dtype, dtype) # otherwise should have _can_hold_element @@ -931,21 +955,6 @@ def setitem(self, indexer, value): return self # value must be storable at this moment - if is_extension_array_dtype(getattr(value, "dtype", None)): - # We need to be careful not to allow through strings that - # can be parsed to EADtypes - is_ea_value = True - arr_value = 
value - else: - is_ea_value = False - arr_value = np.array(value) - - if transpose: - values = values.T - - # length checking - check_setitem_lengths(indexer, value, values) - exact_match = is_exact_shape_match(values, arr_value) if is_empty_indexer(indexer, arr_value): # GH#8669 empty indexers pass @@ -1669,6 +1678,13 @@ def setitem(self, indexer, value): # we are always 1-D indexer = indexer[0] + if isinstance(value, (np.ndarray, ExtensionArray)) and value.ndim == self.ndim == 2: + # TODO: test for this + value = value.T + if value.shape[0] != 1: + raise ValueError + value = value[0] + check_setitem_lengths(indexer, value, self.values) self.values[indexer] = value return self diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index cc5576719ff43..51ac5ddcd3f72 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -38,6 +38,7 @@ from pandas.core.dtypes.generic import ABCDataFrame, ABCPandasArray, ABCSeries from pandas.core.dtypes.missing import array_equals, isna +from pandas.core.indexing import maybe_convert_ix import pandas.core.algorithms as algos from pandas.core.arrays.sparse import SparseDtype from pandas.core.construction import extract_array @@ -562,6 +563,69 @@ def where(self, other, cond, align: bool, errors: str, axis: int) -> "BlockManag def setitem(self, indexer, value) -> "BlockManager": return self.apply("setitem", indexer=indexer, value=value) + # TODO: could just operate inplace, so we dont end up swapping out + # parent frame/series _mgr? 
+ def setitem2(self, indexer, value) -> "BlockManager": + result_blocks = [] + + # assuming for now 2D + pi, col_indexer = indexer + + if lib.is_integer(col_indexer): + col_indexer = [col_indexer] + col_indexer = Index(col_indexer) + col_indexer2 = list(col_indexer) + + def handle_block(blk: Block) -> List[Block]: + locs = Index(blk.mgr_locs.as_array).intersection(col_indexer) + ilocs = [list(blk.mgr_locs).index(x) for x in locs] + # iloc2 = self.blklocs[locs] + # assert (ilocs == ilocs2).all(), (ilocs, ilocs2) + # this assertion works for non-recursed blocks + rlocs = [col_indexer2.index(x) for x in locs] + + if not len(ilocs): + nbs = [blk] + else: + is2d = False + value_for_block = value + if getattr(value, "ndim", 0) == 2: + is2d = True + if isinstance(value, ABCDataFrame): + # TODO: similar to what we have in BlockManager.apply? + value_for_block = value.iloc[:, rlocs] + else: + value_for_block = value[:, rlocs] + + blk_indexer = (pi, ilocs) + blk_indexer = maybe_convert_ix(*blk_indexer) + + if blk._can_hold_element(value_for_block) and (not blk.is_object or (is2d and value_for_block.shape[1] == blk.shape[0])): + nb = blk.setitem(blk_indexer, value_for_block) + nbs = [nb] + + elif blk.shape[0] == 1: + # casting + nb = blk.setitem(blk_indexer, value_for_block) + nbs = [nb] + + else: + # recurse -> operate column-wise + blocks = blk._split() + nbs = [] + for subblk in blocks: + nbs2 = handle_block(subblk) + nbs.extend(nbs2) + + return nbs + + for blk in self.blocks: + nbs = handle_block(blk) + + result_blocks.extend(nbs) + + return type(self).from_blocks(result_blocks, self.axes) + def putmask(self, mask, new, align: bool = True, axis: int = 0): if align: diff --git a/pandas/core/series.py b/pandas/core/series.py index 15c7d2b964d79..22b08990e7165 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1020,7 +1020,7 @@ def _set_labels(self, key, value): def _set_values(self, key, value): if isinstance(key, Series): - key = key._values + key = 
key._values # TODO: has this necessarily been aligned? self._mgr = self._mgr.setitem( # type: ignore[assignment] indexer=key, value=value ) From 77e09e24962a6b6b12b97a48bbd510592d1c6b43 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 14 Jan 2021 07:54:20 -0800 Subject: [PATCH 03/13] checkpoint tests passing --- pandas/core/indexing.py | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 31aa3c81374bf..b42311ad71abf 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -22,6 +22,7 @@ is_list_like, is_numeric_dtype, is_object_dtype, + is_extension_array_dtype, is_scalar, is_sequence, ) @@ -1682,7 +1683,17 @@ def _setitem_with_indexer_split_path(self, indexer, value, name: str): elif len(ilocs) == 1 and lplane_indexer == len(value) and not is_scalar(pi): # We are setting multiple rows in a single column. - self._setitem_single_column(ilocs[0], value, pi) + if is_extension_array_dtype(value): + # TODO(EA2D): special case not needed with 2D EAs + self._setitem_single_column(ilocs[0], value, pi) + elif len(value) == len(self.obj): + # Setting entire column, so swapping out + # GH#??? we may want to change this behavior + self.obj._iset_item(ilocs[0], value) + else: + val = np.atleast_2d(value).T + self.obj._mgr = self.obj._mgr.setitem2((pi, ilocs), val) + self.obj._clear_item_cache() elif len(ilocs) == 1 and 0 != lplane_indexer != len(value): # We are trying to set N values into M entries of a single @@ -1705,6 +1716,18 @@ def _setitem_with_indexer_split_path(self, indexer, value, name: str): elif len(ilocs) == len(value): # We are setting multiple columns in a single row. + #if is_extension_array_dtype(value): # TODO: not hit + # val = DataFrame.from_arrays([value], index=[0], columns=range(len(value))) + #elif isinstance(value, np.ndarray): + # val = np.atleast_2d(value) + #else: + # # avoid numpy casting which can take e.g. 
["b", 2] -> ["b", "2"] + # ser = self.obj._constructor_sliced(value) + # val = ser.to_frame().T + # + #breakpoint() + #self.obj._mgr = self.obj._mgr.setitem2((pi, ilocs), val) + #self.obj._clear_item_cache() for loc, v in zip(ilocs, value): self._setitem_single_column(loc, v, pi) @@ -1742,7 +1765,8 @@ def _setitem_with_indexer_2d_value(self, indexer, value): "Must have equal len keys and value when setting with an ndarray" ) - # self.obj._mgr = self.obj._mgr.setitem2((pi, ilocs), value) + #self.obj._mgr = self.obj._mgr.setitem2((pi, ilocs), value) + #self.obj._clear_item_cache() # need to make setitem2 re-coerce for i, loc in enumerate(ilocs): # setting with a list, re-coerces @@ -1771,7 +1795,7 @@ def _setitem_with_indexer_frame_value(self, indexer, value: DataFrame, name: str if item in value: sub_indexer[1] = item val = self._align_series( - tuple(sub_indexer), + (pi, item), value.iloc[:, loc], multiindex_indexer, ) @@ -1787,9 +1811,8 @@ def _setitem_with_indexer_frame_value(self, indexer, value: DataFrame, name: str for loc in ilocs: item = self.obj.columns[loc] if item in value: - sub_indexer[1] = item val = self._align_series( - tuple(sub_indexer), value[item], multiindex_indexer + (pi, item), value[item], multiindex_indexer ) else: val = np.nan From 46126abd88a05520a7575a3f518b5ef64065838f Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 15 Jan 2021 08:52:47 -0800 Subject: [PATCH 04/13] checkpoint passing --- pandas/core/arrays/datetimelike.py | 6 +++ pandas/core/frame.py | 19 +++++++ pandas/core/indexing.py | 49 ++++++++++++------- pandas/core/internals/blocks.py | 20 +++++++- pandas/core/internals/managers.py | 5 +- .../tests/frame/indexing/test_categorical.py | 1 + 6 files changed, 80 insertions(+), 20 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index b31bc0934fe60..565cca3aca7ff 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -60,6 +60,7 @@ pandas_dtype, ) 
from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna +from pandas.core.dtypes.generic import ABCDataFrame from pandas.core import nanops, ops from pandas.core.algorithms import checked_add_with_arr, isin, unique1d, value_counts @@ -613,6 +614,11 @@ def _validate_listlike(self, value, allow_object: bool = False): # We treat empty list as our own dtype. return type(self)._from_sequence([], dtype=self.dtype) + if isinstance(value, ABCDataFrame) and value.shape[1] == 1: + # FIXME: kludge + res = self._validate_listlike(value._ixs(0, axis=1), allow_object=allow_object) + return res.reshape(-1, 1) + # Do type inference if necessary up front # e.g. we passed PeriodIndex.values and got an ndarray of Periods value = array(value) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 787cb0a1ff12b..163ae121877db 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3209,6 +3209,7 @@ def _setitem_slice(self, key: slice, value): def _setitem_array(self, key, value): # also raises Exception if object array with NA values if com.is_bool_indexer(key): + # bool indexer is indexing along rows if len(key) != len(self.index): raise ValueError( f"Item wrong length {len(key)} instead of {len(self.index)}!" @@ -3223,6 +3224,24 @@ def _setitem_array(self, key, value): raise ValueError("Columns must be same length as key") for k1, k2 in zip(key, value.columns): self[k1] = value[k2] + + elif all(is_hashable(x) for x in key) and any(x not in self.columns for x in key): + # We need to do this instead of going through iloc to ensure + # we get correct dtype for new columns + self._check_setitem_copy() + + # TODO: de-duplicate with some of whats in indexing.py + if is_scalar(value): + for i, x in enumerate(key): + self[x] = value + elif np.ndim(value) == 1: + for i, x in enumerate(key): + self[x] = value[i] + else: + value = np.asarray(value) # TODO: or DataFrame? 
+ for i, x in enumerate(key): + self[x] = value[:, i] + else: self.loc._ensure_listlike_indexer(key, axis=1, value=value) indexer = self.loc._get_listlike_indexer( diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index b42311ad71abf..3bd20f86aabab 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1683,13 +1683,19 @@ def _setitem_with_indexer_split_path(self, indexer, value, name: str): elif len(ilocs) == 1 and lplane_indexer == len(value) and not is_scalar(pi): # We are setting multiple rows in a single column. - if is_extension_array_dtype(value): - # TODO(EA2D): special case not needed with 2D EAs - self._setitem_single_column(ilocs[0], value, pi) - elif len(value) == len(self.obj): + if len(value) == len(self.obj): # Setting entire column, so swapping out # GH#??? we may want to change this behavior self.obj._iset_item(ilocs[0], value) + elif is_extension_array_dtype(value): + # TODO(EA2D): special case not needed with 2D EAs + obj = type(self.obj)(value) + orig_mgr = self.obj._mgr.copy(deep=True) + new_mgr = self.obj._mgr.setitem2((pi, ilocs), obj) + #self.obj._mgr = self.obj._mgr.setitem2((pi, ilocs), obj) + #self.obj._clear_item_cache() + # 1 test stil failing would be fixed by using _setitem_single_column + self._setitem_single_column(ilocs[0], value, pi) else: val = np.atleast_2d(value).T self.obj._mgr = self.obj._mgr.setitem2((pi, ilocs), val) @@ -1715,21 +1721,28 @@ def _setitem_with_indexer_split_path(self, indexer, value, name: str): pass elif len(ilocs) == len(value): - # We are setting multiple columns in a single row. - #if is_extension_array_dtype(value): # TODO: not hit - # val = DataFrame.from_arrays([value], index=[0], columns=range(len(value))) - #elif isinstance(value, np.ndarray): - # val = np.atleast_2d(value) - #else: - # # avoid numpy casting which can take e.g. 
["b", 2] -> ["b", "2"] - # ser = self.obj._constructor_sliced(value) - # val = ser.to_frame().T + # We are setting multiple columns in a with one row which we broadcast + if is_extension_array_dtype(value): # TODO: not hit + val = DataFrame.from_arrays([value], index=[0], columns=range(len(value))) + elif isinstance(value, np.ndarray): + val = np.atleast_2d(value) + else: + # avoid numpy casting which can take e.g. ["b", 2] -> ["b", "2"] + #ser = self.obj._constructor_sliced(value) + #val = ser.to_frame().T + val = type(self.obj)([value]) + if lplane_indexer != 1: + # broadcast to length of pi + # TODO: EA compat for broadcast_to + arrs = list(val._iter_column_arrays()) + arrs = [np.broadcast_to(x, lplane_indexer) for x in arrs] + val = type(self.obj)._from_arrays(arrs, index=range(lplane_indexer), columns=range(len(arrs))) # - #breakpoint() - #self.obj._mgr = self.obj._mgr.setitem2((pi, ilocs), val) - #self.obj._clear_item_cache() - for loc, v in zip(ilocs, value): - self._setitem_single_column(loc, v, pi) + # 3 tests broken here fixed by using _setitem_single_column + self.obj._mgr = self.obj._mgr.setitem2((pi, ilocs), val) + self.obj._clear_item_cache() + #for loc, v in zip(ilocs, value): + # self._setitem_single_column(loc, v, pi) elif len(ilocs) == 1 and com.is_null_slice(pi) and len(self.obj) == 0: # This is a setitem-with-expansion, see diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index cbf4f066598dc..b6c8ebbe2e86f 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -76,6 +76,7 @@ check_setitem_lengths, is_empty_indexer, is_exact_shape_match, + length_of_indexer, is_scalar_indexer, ) import pandas.core.missing as missing @@ -1678,12 +1679,25 @@ def setitem(self, indexer, value): # we are always 1-D indexer = indexer[0] - if isinstance(value, (np.ndarray, ExtensionArray)) and value.ndim == self.ndim == 2: + if isinstance(indexer, np.ndarray) and self.ndim == indexer.ndim == 2: + # possibly 
constructed with maybe_convert_ix + + indexer = indexer.squeeze() + indexer = np.atleast_1d(indexer) + + if ( + isinstance(value, (np.ndarray, ExtensionArray)) + and value.ndim == self.ndim == 2 + ): # TODO: test for this value = value.T if value.shape[0] != 1: raise ValueError value = value[0] + elif isinstance(value, ABCDataFrame) and self.ndim == 2: + if value.shape[1] != 1: + raise ValueError + value = value._ixs(0, axis=1)._values check_setitem_lengths(indexer, value, self.values) self.values[indexer] = value @@ -2061,6 +2075,10 @@ def where(self, other, cond, errors="raise", axis: int = 0) -> List["Block"]: def _can_hold_element(self, element: Any) -> bool: arr = self.array_values() + if isinstance(element, ABCDataFrame) and element.shape[1] == 1: + element = element.iloc[:, 0] + # TODO: probably need to update _can_hold_element + try: arr._validate_setitem_value(element) return True diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 51ac5ddcd3f72..1abc73859061d 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -600,7 +600,10 @@ def handle_block(blk: Block) -> List[Block]: blk_indexer = (pi, ilocs) blk_indexer = maybe_convert_ix(*blk_indexer) - if blk._can_hold_element(value_for_block) and (not blk.is_object or (is2d and value_for_block.shape[1] == blk.shape[0])): + if blk._can_hold_element(value_for_block) and ( + not blk.is_object + or (is2d and value_for_block.shape[1] == blk.shape[0]) + ): nb = blk.setitem(blk_indexer, value_for_block) nbs = [nb] diff --git a/pandas/tests/frame/indexing/test_categorical.py b/pandas/tests/frame/indexing/test_categorical.py index 329987350b7a9..49021feb665f2 100644 --- a/pandas/tests/frame/indexing/test_categorical.py +++ b/pandas/tests/frame/indexing/test_categorical.py @@ -100,6 +100,7 @@ def test_loc_setitem_multiple_rows(self, dfs_for_assignment): df = orig.copy() df.loc["j":"k", :] = [["c", 2], ["c", 2]] + # TODO: split still-giant test def 
test_assigning_ops(self, dfs_for_assignment): # systematically test the assigning operations: # for all slicing ops: From 2624c2ed0e60ae2ee81968f123e5df2dd81a1763 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 15 Jan 2021 13:55:37 -0800 Subject: [PATCH 05/13] port test from ref-setitem-blockwise --- pandas/_testing/__init__.py | 8 + .../tests/frame/indexing/test_categorical.py | 320 ++++++------------ 2 files changed, 113 insertions(+), 215 deletions(-) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index b36e790f8023b..549a3c8e4a681 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -977,3 +977,11 @@ def loc(x): def iloc(x): return x.iloc + + +def at(x): + return x.at + + +def iat(x): + return x.iat diff --git a/pandas/tests/frame/indexing/test_categorical.py b/pandas/tests/frame/indexing/test_categorical.py index 49021feb665f2..b3e0783d7388f 100644 --- a/pandas/tests/frame/indexing/test_categorical.py +++ b/pandas/tests/frame/indexing/test_categorical.py @@ -7,6 +7,9 @@ from pandas import Categorical, DataFrame, Index, Series import pandas._testing as tm +msg1 = "Cannot setitem on a Categorical with a new category, set the categories first" +msg2 = "Cannot set a Categorical with another, without identical categories" + class TestDataFrameIndexingCategorical: def test_assignment(self): @@ -55,83 +58,43 @@ def test_assignment(self): df = DataFrame(Series(cat)) @pytest.fixture - def dfs_for_assignment(self): + def orig(self): cats = Categorical(["a", "a", "a", "a", "a", "a", "a"], categories=["a", "b"]) idx = Index(["h", "i", "j", "k", "l", "m", "n"]) values = [1, 1, 1, 1, 1, 1, 1] orig = DataFrame({"cats": cats, "values": values}, index=idx) + return orig + + @pytest.fixture + def exp_single_row(self): + # The expected values if we change a single row + cats1 = Categorical(["a", "a", "b", "a", "a", "a", "a"], categories=["a", "b"]) + idx1 = Index(["h", "i", "j", "k", "l", "m", "n"]) + values1 = [1, 1, 2, 1, 1, 1, 
1] + exp_single_row = DataFrame({"cats": cats1, "values": values1}, index=idx1) + return exp_single_row + @pytest.fixture + def exp_multi_row(self): + # assign multiple rows (mixed values) (-> array) -> exp_multi_row # changed multiple rows cats2 = Categorical(["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"]) idx2 = Index(["h", "i", "j", "k", "l", "m", "n"]) values2 = [1, 1, 2, 2, 1, 1, 1] exp_multi_row = DataFrame({"cats": cats2, "values": values2}, index=idx2) - return orig, exp_multi_row - - def test_iloc_setitem_multiple_rows(self, dfs_for_assignment): - # - assign multiple rows (mixed values) -> exp_multi_row - orig, exp_multi_row = dfs_for_assignment - df = orig.copy() - - df.iloc[2:4, :] = [["b", 2], ["b", 2]] - tm.assert_frame_equal(df, exp_multi_row) - - msg1 = ( - "Cannot setitem on a Categorical with a new category, " - "set the categories first" - ) - with pytest.raises(ValueError, match=msg1): - df = orig.copy() - df.iloc[2:4, :] = [["c", 2], ["c", 2]] - - def test_loc_setitem_multiple_rows(self, dfs_for_assignment): - # - assign multiple rows (mixed values) -> exp_multi_row - orig, exp_multi_row = dfs_for_assignment - df = orig.copy() - - df.loc["j":"k", :] = [["b", 2], ["b", 2]] - tm.assert_frame_equal(df, exp_multi_row) - - msg1 = ( - "Cannot setitem on a Categorical with a new category, " - "set the categories first" - ) - with pytest.raises(ValueError, match=msg1): - df = orig.copy() - df.loc["j":"k", :] = [["c", 2], ["c", 2]] - - # TODO: split still-giant test - def test_assigning_ops(self, dfs_for_assignment): - # systematically test the assigning operations: - # for all slicing ops: - # for value in categories and value not in categories: - - # - assign a single value -> exp_single_cats_value - - # - assign a complete row (mixed values) -> exp_single_row - - # assign multiple rows (mixed values) (-> array) -> exp_multi_row - - # assign a part of a column with dtype == categorical -> - # exp_parts_cats_col - - # assign a part of a 
column with dtype != categorical -> - # exp_parts_cats_col - orig, exp_multi_row = dfs_for_assignment - - # the expected values - # changed single row - cats1 = Categorical(["a", "a", "b", "a", "a", "a", "a"], categories=["a", "b"]) - idx1 = Index(["h", "i", "j", "k", "l", "m", "n"]) - values1 = [1, 1, 2, 1, 1, 1, 1] - exp_single_row = DataFrame({"cats": cats1, "values": values1}, index=idx1) + return exp_multi_row + @pytest.fixture + def exp_parts_cats_col(self): # changed part of the cats column cats3 = Categorical(["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"]) idx3 = Index(["h", "i", "j", "k", "l", "m", "n"]) values3 = [1, 1, 1, 1, 1, 1, 1] exp_parts_cats_col = DataFrame({"cats": cats3, "values": values3}, index=idx3) + return exp_parts_cats_col + @pytest.fixture + def exp_single_cats_value(self): # changed single value in cats col cats4 = Categorical(["a", "a", "b", "a", "a", "a", "a"], categories=["a", "b"]) idx4 = Index(["h", "i", "j", "k", "l", "m", "n"]) @@ -139,195 +102,129 @@ def test_assigning_ops(self, dfs_for_assignment): exp_single_cats_value = DataFrame( {"cats": cats4, "values": values4}, index=idx4 ) + return exp_single_cats_value - # iloc - # ############### - # - assign a single value -> exp_single_cats_value - df = orig.copy() - df.iloc[2, 0] = "b" - tm.assert_frame_equal(df, exp_single_cats_value) - - df = orig.copy() - df.iloc[df.index == "j", 0] = "b" - tm.assert_frame_equal(df, exp_single_cats_value) - - # - assign a single value not in the current categories set - msg1 = ( - "Cannot setitem on a Categorical with a new category, " - "set the categories first" - ) - msg2 = "Cannot set a Categorical with another, without identical categories" - with pytest.raises(ValueError, match=msg1): - df = orig.copy() - df.iloc[2, 0] = "c" - - # - assign a complete row (mixed values) -> exp_single_row - df = orig.copy() - df.iloc[2, :] = ["b", 2] - tm.assert_frame_equal(df, exp_single_row) - - # - assign a complete row (mixed values) not in 
categories set - with pytest.raises(ValueError, match=msg1): - df = orig.copy() - df.iloc[2, :] = ["c", 2] - - # assign a part of a column with dtype == categorical -> - # exp_parts_cats_col + @pytest.mark.parametrize("indexer", [tm.loc, tm.iloc]) + def test_loc_iloc_setitem_list_of_lists(self, orig, exp_multi_row, indexer): + # - assign multiple rows (mixed values) -> exp_multi_row df = orig.copy() - df.iloc[2:4, 0] = Categorical(["b", "b"], categories=["a", "b"]) - tm.assert_frame_equal(df, exp_parts_cats_col) - with pytest.raises(ValueError, match=msg2): - # different categories -> not sure if this should fail or pass - df = orig.copy() - df.iloc[2:4, 0] = Categorical(list("bb"), categories=list("abc")) + key = slice(2, 4) + if indexer is tm.loc: + key = slice("j", "k") - with pytest.raises(ValueError, match=msg2): - # different values - df = orig.copy() - df.iloc[2:4, 0] = Categorical(list("cc"), categories=list("abc")) + indexer(df)[key, :] = [["b", 2], ["b", 2]] + tm.assert_frame_equal(df, exp_multi_row) - # assign a part of a column with dtype != categorical -> - # exp_parts_cats_col df = orig.copy() - df.iloc[2:4, 0] = ["b", "b"] - tm.assert_frame_equal(df, exp_parts_cats_col) - with pytest.raises(ValueError, match=msg1): - df.iloc[2:4, 0] = ["c", "c"] + indexer(df)[key, :] = [["c", 2], ["c", 2]] - # loc - # ############## + @pytest.mark.parametrize("indexer", [tm.loc, tm.iloc, tm.at, tm.iat]) + def test_loc_iloc_at_iat_setitem_single_value_in_categories( + self, orig, exp_single_cats_value, indexer + ): # - assign a single value -> exp_single_cats_value df = orig.copy() - df.loc["j", "cats"] = "b" - tm.assert_frame_equal(df, exp_single_cats_value) - - df = orig.copy() - df.loc[df.index == "j", "cats"] = "b" - tm.assert_frame_equal(df, exp_single_cats_value) - # - assign a single value not in the current categories set - with pytest.raises(ValueError, match=msg1): - df = orig.copy() - df.loc["j", "cats"] = "c" + key = (2, 0) + if indexer in [tm.loc, tm.at]: 
+ key = (df.index[2], df.columns[0]) - # - assign a complete row (mixed values) -> exp_single_row - df = orig.copy() - df.loc["j", :] = ["b", 2] - tm.assert_frame_equal(df, exp_single_row) + # "b" is among the categories for df["cat"] + indexer(df)[key] = "b" + tm.assert_frame_equal(df, exp_single_cats_value) - # - assign a complete row (mixed values) not in categories set + # "c" is not among the categories for df["cat"] with pytest.raises(ValueError, match=msg1): - df = orig.copy() - df.loc["j", :] = ["c", 2] - - # assign a part of a column with dtype == categorical -> - # exp_parts_cats_col - df = orig.copy() - df.loc["j":"k", "cats"] = Categorical(["b", "b"], categories=["a", "b"]) - tm.assert_frame_equal(df, exp_parts_cats_col) - - with pytest.raises(ValueError, match=msg2): - # different categories -> not sure if this should fail or pass - df = orig.copy() - df.loc["j":"k", "cats"] = Categorical( - ["b", "b"], categories=["a", "b", "c"] - ) - - with pytest.raises(ValueError, match=msg2): - # different values - df = orig.copy() - df.loc["j":"k", "cats"] = Categorical( - ["c", "c"], categories=["a", "b", "c"] - ) + indexer(df)[key] = "c" - # assign a part of a column with dtype != categorical -> - # exp_parts_cats_col + @pytest.mark.parametrize("indexer", [tm.loc, tm.iloc]) + def test_loc_iloc_setitem_mask_single_value_in_categories( + self, orig, exp_single_cats_value, indexer + ): + # mask with single True df = orig.copy() - df.loc["j":"k", "cats"] = ["b", "b"] - tm.assert_frame_equal(df, exp_parts_cats_col) - with pytest.raises(ValueError, match=msg1): - df.loc["j":"k", "cats"] = ["c", "c"] + mask = df.index == "j" + key = 0 + if indexer is tm.loc: + key = df.columns[key] - # loc - # ############## - # - assign a single value -> exp_single_cats_value - df = orig.copy() - df.loc["j", df.columns[0]] = "b" + indexer(df)[mask, key] = "b" tm.assert_frame_equal(df, exp_single_cats_value) + @pytest.mark.parametrize("indexer", [tm.loc, tm.iloc]) + def
test_iloc_setitem_full_row_non_categorical_rhs( + self, orig, exp_single_row, indexer + ): + # - assign a complete row (mixed values) -> exp_single_row df = orig.copy() - df.loc[df.index == "j", df.columns[0]] = "b" - tm.assert_frame_equal(df, exp_single_cats_value) - # - assign a single value not in the current categories set - with pytest.raises(ValueError, match=msg1): - df = orig.copy() - df.loc["j", df.columns[0]] = "c" + key = 2 + if indexer is tm.loc: + key = df.index[2] - # - assign a complete row (mixed values) -> exp_single_row - df = orig.copy() - df.loc["j", :] = ["b", 2] + # not categorical dtype, but "b" _is_ among the categories for df["cat"] + indexer(df)[key, :] = ["b", 2] tm.assert_frame_equal(df, exp_single_row) - # - assign a complete row (mixed values) not in categories set + # "c" is not among the categories for df["cat"] with pytest.raises(ValueError, match=msg1): - df = orig.copy() - df.loc["j", :] = ["c", 2] + indexer(df)[key, :] = ["c", 2] + @pytest.mark.parametrize("indexer", [tm.loc, tm.iloc]) + def test_loc_iloc_setitem_partial_col_categorical_rhs( + self, orig, exp_parts_cats_col, indexer + ): # assign a part of a column with dtype == categorical -> # exp_parts_cats_col df = orig.copy() - df.loc["j":"k", df.columns[0]] = Categorical(["b", "b"], categories=["a", "b"]) + + key = (slice(2, 4), 0) + if indexer is tm.loc: + key = (slice("j", "k"), df.columns[0]) + + # same categories as we currently have in df["cats"] + compat = Categorical(["b", "b"], categories=["a", "b"]) + indexer(df)[key] = compat tm.assert_frame_equal(df, exp_parts_cats_col) + # categories do not match df["cat"]'s, but "b" is among them + semi_compat = Categorical(list("bb"), categories=list("abc")) with pytest.raises(ValueError, match=msg2): - # different categories -> not sure if this should fail or pass - df = orig.copy() - df.loc["j":"k", df.columns[0]] = Categorical( - ["b", "b"], categories=["a", "b", "c"] - ) + # different categories but holdable values + # -> 
not sure if this should fail or pass + indexer(df)[key] = semi_compat + # categories do not match df["cat"]'s, and "c" is not among them + incompat = Categorical(list("cc"), categories=list("abc")) with pytest.raises(ValueError, match=msg2): # different values - df = orig.copy() - df.loc["j":"k", df.columns[0]] = Categorical( - ["c", "c"], categories=["a", "b", "c"] - ) - - # assign a part of a column with dtype != categorical -> - # exp_parts_cats_col - df = orig.copy() - df.loc["j":"k", df.columns[0]] = ["b", "b"] - tm.assert_frame_equal(df, exp_parts_cats_col) - - with pytest.raises(ValueError, match=msg1): - df.loc["j":"k", df.columns[0]] = ["c", "c"] + indexer(df)[key] = incompat - # iat + @pytest.mark.parametrize("indexer", [tm.loc, tm.iloc]) + def test_loc_iloc_setitem_non_categorical_rhs( + self, orig, exp_parts_cats_col, indexer + ): + # assign a part of a column with dtype != categorical -> exp_parts_cats_col df = orig.copy() - df.iat[2, 0] = "b" - tm.assert_frame_equal(df, exp_single_cats_value) - # - assign a single value not in the current categories set - with pytest.raises(ValueError, match=msg1): - df = orig.copy() - df.iat[2, 0] = "c" + key = (slice(2, 4), 0) + if indexer is tm.loc: + key = (slice("j", "k"), df.columns[0]) - # at - # - assign a single value -> exp_single_cats_value - df = orig.copy() - df.at["j", "cats"] = "b" - tm.assert_frame_equal(df, exp_single_cats_value) + # "b" is among the categories for df["cat"] + indexer(df)[key] = ["b", "b"] + tm.assert_frame_equal(df, exp_parts_cats_col) - # - assign a single value not in the current categories set + # "c" not part of the categories with pytest.raises(ValueError, match=msg1): - df = orig.copy() - df.at["j", "cats"] = "c" + indexer(df)[key] = ["c", "c"] + def test_setitem_mask_categorical(self, exp_multi_row): # fancy indexing + catsf = Categorical( ["a", "a", "c", "c", "a", "a", "a"], categories=["a", "b", "c"] ) @@ -341,19 +238,12 @@ def test_assigning_ops(self, dfs_for_assignment): ) 
assert return_value is None - df[df["cats"] == "c"] = ["b", 2] + mask = df["cats"] == "c" + df[mask] = ["b", 2] # category c is kept in .categories tm.assert_frame_equal(df, exp_fancy) - # set_value - df = orig.copy() - df.at["j", "cats"] = "b" - tm.assert_frame_equal(df, exp_single_cats_value) - - with pytest.raises(ValueError, match=msg1): - df = orig.copy() - df.at["j", "cats"] = "c" - + def test_loc_setitem_categorical_values_partial_column_slice(self): # Assigning a Category to parts of a int/... column uses the values of # the Categorical df = DataFrame({"a": [1, 1, 1, 1, 1], "b": list("aaaaa")}) From 51b90a39624f0a40bc8332e48f6437474a32cde7 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 15 Jan 2021 17:50:15 -0800 Subject: [PATCH 06/13] cleanup --- pandas/core/indexing.py | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 0e45594cad7c9..db7349d7b83ee 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -16,13 +16,13 @@ from pandas.core.dtypes.common import ( is_array_like, is_bool_dtype, + is_extension_array_dtype, is_hashable, is_integer, is_iterator, is_list_like, is_numeric_dtype, is_object_dtype, - is_extension_array_dtype, is_scalar, is_sequence, ) @@ -1690,14 +1690,8 @@ def _setitem_with_indexer_split_path(self, indexer, value, name: str): elif is_extension_array_dtype(value): # TODO(EA2D): special case not needed with 2D EAs obj = type(self.obj)(value) - orig_mgr = self.obj._mgr.copy(deep=True) - #breakpoint() - new_mgr = self.obj._mgr.setitem2((pi, ilocs), obj) - self.obj._mgr = new_mgr - #self.obj._mgr = self.obj._mgr.setitem2((pi, ilocs), obj) - #self.obj._clear_item_cache() - # 1 test stil failing would be fixed by using _setitem_single_column - #self._setitem_single_column(ilocs[0], value, pi) + self.obj._mgr = self.obj._mgr.setitem2((pi, ilocs), obj) + self.obj._clear_item_cache() else: val = np.atleast_2d(value).T self.obj._mgr 
= self.obj._mgr.setitem2((pi, ilocs), val) @@ -1725,20 +1719,22 @@ def _setitem_with_indexer_split_path(self, indexer, value, name: str): elif len(ilocs) == len(value): # We are setting multiple columns in a with one row which we broadcast if is_extension_array_dtype(value): # TODO: not hit - val = DataFrame.from_arrays([value], index=[0], columns=range(len(value))) + val = DataFrame.from_arrays( + [value], index=[0], columns=range(len(value)) + ) elif isinstance(value, np.ndarray): val = np.atleast_2d(value) else: # avoid numpy casting which can take e.g. ["b", 2] -> ["b", "2"] - #ser = self.obj._constructor_sliced(value) - #val = ser.to_frame().T val = type(self.obj)([value]) if lplane_indexer != 1: # broadcast to length of pi # TODO: EA compat for broadcast_to arrs = list(val._iter_column_arrays()) arrs = [np.broadcast_to(x, lplane_indexer) for x in arrs] - val = type(self.obj)._from_arrays(arrs, index=range(lplane_indexer), columns=range(len(arrs))) + val = type(self.obj)._from_arrays( + arrs, index=range(lplane_indexer), columns=range(len(arrs)) + ) self.obj._mgr = self.obj._mgr.setitem2((pi, ilocs), val) self.obj._clear_item_cache() @@ -1776,8 +1772,10 @@ def _setitem_with_indexer_2d_value(self, indexer, value): "Must have equal len keys and value when setting with an ndarray" ) - #self.obj._mgr = self.obj._mgr.setitem2((pi, ilocs), value) - #self.obj._clear_item_cache() + # wrap in DataFrame to coerce where appropriate + # obj = type(self.obj)(value) + # self.obj._mgr = self.obj._mgr.setitem2((pi, ilocs), value) + # self.obj._clear_item_cache() # need to make setitem2 re-coerce for i, loc in enumerate(ilocs): # setting with a list, re-coerces From 08d57baa45fcadf0dcd8bfd37bb5b815c072d4e2 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 15 Jan 2021 21:56:39 -0800 Subject: [PATCH 07/13] checkpoint passing --- pandas/core/indexing.py | 10 +++------- pandas/core/internals/blocks.py | 11 ++++++++++- pandas/core/internals/managers.py | 14 +++++++------- 3 files 
changed, 20 insertions(+), 15 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index db7349d7b83ee..c9cd8a81a5d54 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1773,13 +1773,9 @@ def _setitem_with_indexer_2d_value(self, indexer, value): ) # wrap in DataFrame to coerce where appropriate - # obj = type(self.obj)(value) - # self.obj._mgr = self.obj._mgr.setitem2((pi, ilocs), value) - # self.obj._clear_item_cache() - # need to make setitem2 re-coerce - for i, loc in enumerate(ilocs): - # setting with a list, re-coerces - self._setitem_single_column(loc, value[:, i].tolist(), pi) + obj = type(self.obj)(value.tolist()) + self.obj._mgr = self.obj._mgr.setitem2((pi, ilocs), obj) + self.obj._clear_item_cache() def _setitem_with_indexer_frame_value(self, indexer, value: DataFrame, name: str): ilocs = self._ensure_iterable_column_indexer(indexer[1]) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 0cd46a30ef935..4f9eb747a8e4d 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -70,7 +70,7 @@ ) from pandas.core.base import PandasObject import pandas.core.common as com -from pandas.core.construction import extract_array +from pandas.core.construction import extract_array, ensure_wrapped_if_datetimelike from pandas.core.indexers import ( check_setitem_lengths, is_empty_indexer, @@ -913,6 +913,15 @@ def setitem(self, indexer, value): is_ea_value = False arr_value = np.array(value) + # TODO: why the ndim restriction here? + if self.dtype == object and arr_value.dtype.kind in ["m", "M"] and arr_value.size > 0 and self.ndim == 2: + # get Timestamp/Timedelta, numpy would cast to ints (yikes!) 
+ # FIXME: np.asarray(dta, dtype=object), dta.to_numpy(object) + # both have the same wrong numpy behavior + arr_value = ensure_wrapped_if_datetimelike(arr_value) + arr_value = np.asarray(arr_value.astype(object)) + value = arr_value + if transpose: values = values.T diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index f36a47b78e3e8..764f2b0421ad8 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -588,28 +588,28 @@ def handle_block(blk: Block) -> List[Block]: nbs = [blk] else: is2d = False - value_for_block = value + vfb = value # vfb -> value_for_block if getattr(value, "ndim", 0) == 2: is2d = True if isinstance(value, ABCDataFrame): # TODO: similar to what we have in BlockManager.apply? - value_for_block = value.iloc[:, rlocs] + vfb = value.iloc[:, rlocs] else: - value_for_block = value[:, rlocs] + vfb = value[:, rlocs] blk_indexer = (pi, ilocs) blk_indexer = maybe_convert_ix(*blk_indexer) - if blk._can_hold_element(value_for_block) and ( + if blk._can_hold_element(vfb) and ( not blk.is_object - or (is2d and value_for_block.shape[1] == blk.shape[0]) + or (is2d and vfb.shape[1] == blk.shape[0]) ): - nb = blk.setitem(blk_indexer, value_for_block) + nb = blk.setitem(blk_indexer, vfb) nbs = [nb] elif blk.shape[0] == 1: # casting - nb = blk.setitem(blk_indexer, value_for_block) + nb = blk.setitem(blk_indexer, vfb) nbs = [nb] else: From 19124fe07559de514bcb38256d7e07c9e73ab1ed Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 15 Jan 2021 22:44:44 -0800 Subject: [PATCH 08/13] checkpoint passing --- pandas/core/indexing.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index c9cd8a81a5d54..7d00f2819ee42 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1744,7 +1744,10 @@ def _setitem_with_indexer_split_path(self, indexer, value, name: str): # e.g. 
df = DataFrame(columns=["x", "y"]) # df["x"] = df["x"].astype(np.int64) # df.loc[:, "x"] = [1, 2, 3] - self._setitem_single_column(ilocs[0], value, pi) + + # Setting entire column, so swapping out + # GH#??? we may want to change this behavior + self.obj._iset_item(ilocs[0], value) else: raise ValueError( @@ -1795,19 +1798,8 @@ def _setitem_with_indexer_frame_value(self, indexer, value: DataFrame, name: str elif not unique_cols and value.columns.equals(self.obj.columns): # We assume we are already aligned, see # test_iloc_setitem_frame_duplicate_columns_multiple_blocks - for loc in ilocs: - item = self.obj.columns[loc] - if item in value: - sub_indexer[1] = item - val = self._align_series( - (pi, item), - value.iloc[:, loc], - multiindex_indexer, - ) - else: - val = np.nan - - self._setitem_single_column(loc, val, pi) + self.obj._mgr = self.obj._mgr.setitem2((pi, ilocs), value) + self.obj._clear_item_cache() elif not unique_cols: raise ValueError("Setting with non-unique columns is not allowed.") From 96f566470bf21103b6870f25d5adcb2768e28ab3 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 16 Jan 2021 12:04:12 -0800 Subject: [PATCH 09/13] rename --- pandas/core/frame.py | 4 ++-- pandas/core/indexing.py | 14 +++++++------- pandas/core/internals/managers.py | 10 +++++----- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 866c8173b63ee..396002410920c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3219,7 +3219,7 @@ def _setitem_array(self, key, value): self._check_setitem_copy() self.iloc[indexer] = value else: - if isinstance(value, DataFrame): + if isinstance(value, DataFrame): # 7 test_string_array tests fail if this block is disabled if len(value.columns) != len(key): raise ValueError("Columns must be same length as key") for k1, k2 in zip(key, value.columns): @@ -3311,7 +3311,7 @@ def _set_item_mgr(self, key, value): if len(self): self._check_setitem_copy() - def _iset_item(self, 
loc: int, value): # only called from _setitem_single_column + def _iset_item(self, loc: int, value): value = self._sanitize_column(value) value = _maybe_atleast_2d(value) self._iset_item_mgr(loc, value) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index ff1a1af4d1a8f..5a2990b1ebfe8 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1690,11 +1690,11 @@ def _setitem_with_indexer_split_path(self, indexer, value, name: str): elif is_extension_array_dtype(value): # TODO(EA2D): special case not needed with 2D EAs obj = type(self.obj)(value) - self.obj._mgr = self.obj._mgr.setitem2((pi, ilocs), obj) + self.obj._mgr = self.obj._mgr.setitem_blockwise((pi, ilocs), obj) self.obj._clear_item_cache() else: val = np.atleast_2d(value).T - self.obj._mgr = self.obj._mgr.setitem2((pi, ilocs), val) + self.obj._mgr = self.obj._mgr.setitem_blockwise((pi, ilocs), val) self.obj._clear_item_cache() elif len(ilocs) == 1 and 0 != lplane_indexer != len(value): @@ -1735,7 +1735,7 @@ def _setitem_with_indexer_split_path(self, indexer, value, name: str): val = type(self.obj)._from_arrays( arrs, index=range(lplane_indexer), columns=range(len(arrs)) ) - self.obj._mgr = self.obj._mgr.setitem2((pi, ilocs), val) + self.obj._mgr = self.obj._mgr.setitem_blockwise((pi, ilocs), val) self.obj._clear_item_cache() elif len(ilocs) == 1 and com.is_null_slice(pi) and len(self.obj) == 0: @@ -1758,7 +1758,7 @@ def _setitem_with_indexer_split_path(self, indexer, value, name: str): else: # scalar value - self.obj._mgr = self.obj._mgr.setitem2((pi, ilocs), value) + self.obj._mgr = self.obj._mgr.setitem_blockwise((pi, ilocs), value) self.obj._clear_item_cache() def _setitem_with_indexer_2d_value(self, indexer, value): @@ -1777,7 +1777,7 @@ def _setitem_with_indexer_2d_value(self, indexer, value): # wrap in DataFrame to coerce where appropriate obj = type(self.obj)(value.tolist()) - self.obj._mgr = self.obj._mgr.setitem2((pi, ilocs), obj) + self.obj._mgr = 
self.obj._mgr.setitem_blockwise((pi, ilocs), obj) self.obj._clear_item_cache() def _setitem_with_indexer_frame_value(self, indexer, value: DataFrame, name: str): @@ -1792,13 +1792,13 @@ def _setitem_with_indexer_frame_value(self, indexer, value: DataFrame, name: str # We do not want to align the value in case of iloc GH#37728 if name == "iloc": - self.obj._mgr = self.obj._mgr.setitem2((pi, ilocs), value) + self.obj._mgr = self.obj._mgr.setitem_blockwise((pi, ilocs), value) self.obj._clear_item_cache() elif not unique_cols and value.columns.equals(self.obj.columns): # We assume we are already aligned, see # test_iloc_setitem_frame_duplicate_columns_multiple_blocks - self.obj._mgr = self.obj._mgr.setitem2((pi, ilocs), value) + self.obj._mgr = self.obj._mgr.setitem_blockwise((pi, ilocs), value) self.obj._clear_item_cache() elif not unique_cols: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index fe586582a83b4..f0906ef171bd8 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -567,7 +567,7 @@ def setitem(self, indexer, value) -> BlockManager: # TODO: could just operate inplace, so we dont end up swapping out # parent frame/series _mgr? 
- def setitem2(self, indexer, value) -> "BlockManager": + def setitem_blockwise(self, indexer, value) -> "BlockManager": result_blocks = [] # assuming for now 2D @@ -602,10 +602,10 @@ def handle_block(blk: Block) -> List[Block]: blk_indexer = (pi, ilocs) blk_indexer = maybe_convert_ix(*blk_indexer) - if blk._can_hold_element(vfb) and ( - not blk.is_object - or (is2d and vfb.shape[1] == blk.shape[0]) - ): + # without the extra condition we fail in tests.indexing.test_indexing:: test_astype_assignment, but + # that is doing `df.iloc[:, 0:2] = df.iloc[:, 0:2].astype(np.int64) + # which i _think_ *should* be inplace, so should not be casting, which the test wants to do + if blk._can_hold_element(vfb) and (not blk.is_object or (is2d and vfb.shape[1] == blk.shape[0])): nb = blk.setitem(blk_indexer, vfb) nbs = [nb] From c1e0b0f8fb6e1b4d9ab9beb4ce18b19e4d5520b1 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 16 Jan 2021 14:40:46 -0800 Subject: [PATCH 10/13] REF: avoid going through iloc --- pandas/core/frame.py | 47 ++++++++++++++++++-------------------------- 1 file changed, 19 insertions(+), 28 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 396002410920c..fd2b98e70beba 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3225,36 +3225,27 @@ def _setitem_array(self, key, value): for k1, k2 in zip(key, value.columns): self[k1] = value[k2] - elif all(is_hashable(x) for x in key) and any(x not in self.columns for x in key): - # We need to do this instead of going through iloc to ensure - # we get correct dtype for new columns - self._check_setitem_copy() - - # TODO: de-duplicate with some of whats in indexing.py - if is_scalar(value): - for i, x in enumerate(key): - self[x] = value - elif np.ndim(value) == 1: - for i, x in enumerate(key): - self[x] = value[i] - else: - value = np.asarray(value) # TODO: or DataFrame? 
- for i, x in enumerate(key): - self[x] = value[:, i] + elif not is_list_like(value): + for col in key: + self[col] = value + + elif isinstance(value, np.ndarray) and value.ndim == 2: + if value.shape[-1] != len(key): + raise ValueError("Columns must be same length as key") + + for i, col in enumerate(key): + self[col] = value[:, i] + + elif np.ndim(value) > 1: + # list of lists + value = DataFrame(value).values + return self._setitem_array(key, value) else: - self.loc._ensure_listlike_indexer(key, axis=1, value=value) - indexer = self.loc._get_listlike_indexer( - key, axis=1, raise_missing=False - )[1] - self._check_setitem_copy() - - if is_scalar(value): - indexer = self.iloc._ensure_iterable_column_indexer(indexer) - for i in indexer: - self[self.columns[i]] = value - else: - self.iloc[:, indexer] = value # TODO: indicate not-inplace + if len(value) != len(key): + raise ValueError("Columns must be same length as key") + for i, col in enumerate(key): + self[col] = value[i] def _setitem_frame(self, key, value): # support boolean setting with DataFrame input, e.g. 
From 83f554588241e33393cf9529fc38042817aa502e Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 17 Jan 2021 11:14:35 -0800 Subject: [PATCH 11/13] port from other PRs --- pandas/core/dtypes/missing.py | 4 --- pandas/core/indexers.py | 15 ++++++++++ pandas/core/internals/blocks.py | 11 +++++-- pandas/core/internals/managers.py | 20 +++++++------ pandas/tests/extension/test_numpy.py | 44 +++++++++++++++++++++++++++- 5 files changed, 77 insertions(+), 17 deletions(-) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index a30875a2783d2..6bf696f39d317 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -515,10 +515,6 @@ def infer_fill_value(val, length: int): if not is_list_like(val): val = [val] - if type(val).__name__ == "PandasArray": - # for test_numpy test where we patch PandasArray._typ - val = val.to_numpy() - if is_extension_array_dtype(val): # We cannot use dtype._na_value bc pd.NA/pd.NaT do not preserve dtype if len(val) == length: diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py index 79479c6db8d9d..8d065daf63204 100644 --- a/pandas/core/indexers.py +++ b/pandas/core/indexers.py @@ -222,6 +222,21 @@ def validate_indices(indices: np.ndarray, n: int) -> None: # Indexer Conversion +def ensure_iterable_indexer(ncols: int, column_indexer): + """ + Ensure that our column indexer is something that can be iterated over. + """ + if is_integer(column_indexer): + ilocs = [column_indexer] + elif isinstance(column_indexer, slice): + ilocs = np.arange(ncols)[column_indexer] + elif isinstance(column_indexer, np.ndarray) and is_bool_dtype(column_indexer.dtype): + ilocs = np.arange(len(column_indexer))[column_indexer] + else: + ilocs = column_indexer + return ilocs + + def maybe_convert_indices(indices, n: int): """ Attempt to convert indices into valid, positive indices. 
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 1e7b8b1fd0899..2b222f8ae2a8e 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -72,12 +72,11 @@ ) from pandas.core.base import PandasObject import pandas.core.common as com -from pandas.core.construction import extract_array, ensure_wrapped_if_datetimelike +from pandas.core.construction import ensure_wrapped_if_datetimelike, extract_array from pandas.core.indexers import ( check_setitem_lengths, is_empty_indexer, is_exact_shape_match, - length_of_indexer, is_scalar_indexer, ) import pandas.core.missing as missing @@ -916,7 +915,12 @@ def setitem(self, indexer, value): arr_value = np.array(value) # TODO: why the ndim restriction here? - if self.dtype == object and arr_value.dtype.kind in ["m", "M"] and arr_value.size > 0 and self.ndim == 2: + if ( + self.dtype == object + and arr_value.dtype.kind in ["m", "M"] + and arr_value.size > 0 + and self.ndim == 2 + ): # get Timestamp/Timedelta, numpy would cast to ints (yikes!) 
# FIXME: np.asarray(dta, dtype=object), dta.to_numpy(object) # both have the same wrong numpy behavior @@ -947,6 +951,7 @@ def setitem(self, indexer, value): return self.astype(dtype).setitem(indexer, value) if isinstance(indexer, tuple) and len(indexer) == self.ndim: + # test_loc_setitem_consistency, test_loc_setitem_consistency_dt64_to_float if com.is_null_slice(indexer[0]): value2 = lib.item_from_zerodim(value) if lib.is_scalar(value2): diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index f0906ef171bd8..e49bea75cf730 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -40,12 +40,12 @@ from pandas.core.dtypes.generic import ABCDataFrame, ABCPandasArray, ABCSeries from pandas.core.dtypes.missing import array_equals, isna -from pandas.core.indexing import maybe_convert_ix import pandas.core.algorithms as algos from pandas.core.arrays.sparse import SparseDtype from pandas.core.construction import extract_array -from pandas.core.indexers import maybe_convert_indices +from pandas.core.indexers import ensure_iterable_indexer, maybe_convert_indices from pandas.core.indexes.api import Index, ensure_index +from pandas.core.indexing import maybe_convert_ix from pandas.core.internals.base import DataManager from pandas.core.internals.blocks import ( Block, @@ -567,24 +567,24 @@ def setitem(self, indexer, value) -> BlockManager: # TODO: could just operate inplace, so we dont end up swapping out # parent frame/series _mgr? 
- def setitem_blockwise(self, indexer, value) -> "BlockManager": + def setitem_blockwise(self, indexer, value) -> BlockManager: result_blocks = [] # assuming for now 2D pi, col_indexer = indexer - if lib.is_integer(col_indexer): - col_indexer = [col_indexer] + col_indexer = ensure_iterable_indexer(len(self.items), col_indexer) col_indexer = Index(col_indexer) col_indexer2 = list(col_indexer) def handle_block(blk: Block) -> List[Block]: locs = Index(blk.mgr_locs.as_array).intersection(col_indexer) ilocs = [list(blk.mgr_locs).index(x) for x in locs] - # iloc2 = self.blklocs[locs] - # assert (ilocs == ilocs2).all(), (ilocs, ilocs2) - # this assertion works for non-recursed blocks + # For blocks that are among self.blocks (i.e. not reached via recursion) + # this should match self.blklocs[locs] rlocs = [col_indexer2.index(x) for x in locs] + rlocs2 = col_indexer.get_indexer(locs) + assert (rlocs2 == rlocs).all() if not len(ilocs): nbs = [blk] @@ -605,7 +605,9 @@ def handle_block(blk: Block) -> List[Block]: # without the extra condition we fail in tests.indexing.test_indexing:: test_astype_assignment, but # that is doing `df.iloc[:, 0:2] = df.iloc[:, 0:2].astype(np.int64) # which i _think_ *should* be inplace, so should not be casting, which the test wants to do - if blk._can_hold_element(vfb) and (not blk.is_object or (is2d and vfb.shape[1] == blk.shape[0])): + if blk._can_hold_element(vfb) and ( + not blk.is_object or (is2d and vfb.shape[1] == blk.shape[0]) + ): nb = blk.setitem(blk_indexer, vfb) nbs = [nb] diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 1f0181eec8830..d4cf173b4463e 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -16,9 +16,12 @@ import numpy as np import pytest +from pandas.core.dtypes.missing import infer_fill_value as infer_fill_value_orig + import pandas as pd import pandas._testing as tm -from pandas.core.arrays.numpy_ import PandasArray, PandasDtype 
+from pandas.core.arrays import PandasArray, PandasDtype, StringArray +from pandas.core.construction import extract_array from . import base @@ -28,6 +31,31 @@ def dtype(request): return PandasDtype(np.dtype(request.param)) +orig_setitem = pd.core.internals.Block.setitem + + +def setitem(self, indexer, value): + # patch Block.setitem + value = extract_array(value, extract_numpy=True) + if isinstance(value, PandasArray) and not isinstance(value, StringArray): + value = value.to_numpy() + if self.ndim == 2 and value.ndim == 1: + # TODO(EA2D): special case not needed with 2D EAs + value = np.atleast_2d(value) + + return orig_setitem(self, indexer, value) + + +def infer_fill_value(val, length: int): + # GH#39044 we have to patch core.dtypes.missing.infer_fill_value + # to unwrap PandasArray bc it won't recognize PandasArray with + # is_extension_dtype + if isinstance(val, PandasArray): + val = val.to_numpy() + + return infer_fill_value_orig(val, length) + + @pytest.fixture def allow_in_pandas(monkeypatch): """ @@ -47,6 +75,8 @@ def allow_in_pandas(monkeypatch): """ with monkeypatch.context() as m: m.setattr(PandasArray, "_typ", "extension") + m.setattr(pd.core.indexing, "infer_fill_value", infer_fill_value) + m.setattr(pd.core.internals.Block, "setitem", setitem) yield @@ -501,6 +531,18 @@ def test_setitem_slice(self, data, box_in_series): def test_setitem_loc_iloc_slice(self, data): super().test_setitem_loc_iloc_slice(data) + def test_setitem_series(self, data, full_indexer): + # https://github.com/pandas-dev/pandas/issues/32395 + ser = pd.Series(data, name="data") + result = pd.Series(index=ser.index, dtype=object, name="data") + + key = full_indexer(ser) + result.loc[key] = ser + + # For PandasArray we expect to get unboxed to numpy + expected = pd.Series(data.to_numpy(), name="data") + self.assert_series_equal(result, expected) + @skip_nested class TestParsing(BaseNumPyTests, base.BaseParsingTests): From 871844461f1a82f63354f2d67c8ca3d90288a810 Mon Sep 17 00:00:00 
2001 From: Brock Date: Mon, 18 Jan 2021 13:19:20 -0800 Subject: [PATCH 12/13] cleanup --- pandas/core/internals/managers.py | 8 +++----- pandas/core/series.py | 2 +- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index e49bea75cf730..86ef4068b3bfd 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -575,16 +575,14 @@ def setitem_blockwise(self, indexer, value) -> BlockManager: col_indexer = ensure_iterable_indexer(len(self.items), col_indexer) col_indexer = Index(col_indexer) - col_indexer2 = list(col_indexer) def handle_block(blk: Block) -> List[Block]: locs = Index(blk.mgr_locs.as_array).intersection(col_indexer) - ilocs = [list(blk.mgr_locs).index(x) for x in locs] # For blocks that are among self.blocks (i.e. not reached via recursion) # this should match self.blklocs[locs] - rlocs = [col_indexer2.index(x) for x in locs] - rlocs2 = col_indexer.get_indexer(locs) - assert (rlocs2 == rlocs).all() + ilocs = [list(blk.mgr_locs).index(x) for x in locs] + + rlocs = col_indexer.get_indexer(locs) if not len(ilocs): nbs = [blk] diff --git a/pandas/core/series.py b/pandas/core/series.py index bfff05997eba9..3888194305d76 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1022,7 +1022,7 @@ def _set_labels(self, key, value): def _set_values(self, key, value): if isinstance(key, Series): - key = key._values # TODO: has this necessarily been aligned? 
+ key = key._values self._mgr = self._mgr.setitem( # type: ignore[assignment] indexer=key, value=value ) From 604f796f9a78d572ec784334f862a1a6d2fbc7ee Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 20 Jan 2021 09:25:39 -0800 Subject: [PATCH 13/13] cleanup --- pandas/core/indexing.py | 8 +------- pandas/core/internals/blocks.py | 29 ++++++++++++++++++++--------- pandas/core/internals/managers.py | 6 ++++-- 3 files changed, 25 insertions(+), 18 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 7c52c17847f5d..10c21ff53d9ab 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1679,15 +1679,10 @@ def _setitem_with_indexer_split_path(self, indexer, value, name: str): # Setting entire column, so swapping out # GH#??? we may want to change this behavior self.obj._iset_item(ilocs[0], value) - elif is_extension_array_dtype(value): - # TODO(EA2D): special case not needed with 2D EAs + else: obj = type(self.obj)(value) self.obj._mgr = self.obj._mgr.setitem_blockwise((pi, ilocs), obj) self.obj._clear_item_cache() - else: - val = np.atleast_2d(value).T - self.obj._mgr = self.obj._mgr.setitem_blockwise((pi, ilocs), val) - self.obj._clear_item_cache() elif len(ilocs) == 1 and 0 != lplane_indexer != len(value): # We are trying to set N values into M entries of a single @@ -1775,7 +1770,6 @@ def _setitem_with_indexer_2d_value(self, indexer, value): def _setitem_with_indexer_frame_value(self, indexer, value: DataFrame, name: str): ilocs = self._ensure_iterable_column_indexer(indexer[1]) - sub_indexer = list(indexer) pi = indexer[0] multiindex_indexer = isinstance(self.obj.columns, ABCMultiIndex) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 01c4dd3917257..62ec2c3b9e798 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -903,6 +903,7 @@ def setitem(self, indexer, value): values = self.values + # FIXME: avoid getting here with DataFrame value; ambiguous casting 
if is_extension_array_dtype(getattr(value, "dtype", None)): # We need to be careful not to allow through strings that # can be parsed to EADtypes @@ -935,6 +936,25 @@ def setitem(self, indexer, value): if not self._can_hold_element(value): # current dtype cannot store value, coerce to common dtype + + is_full = exact_match or ( + isinstance(indexer, tuple) + and len(indexer) == self.ndim + and com.is_null_slice(indexer[0]) + ) + if is_full: + # test_loc_setitem_consistency, + # test_loc_setitem_consistency_dt64_to_float + value2 = lib.item_from_zerodim(value) + if lib.is_scalar(value2): + # TODO: de-duplicate with similar in setitem_single_block + value2 = np.full(self.shape, arr_value) + return self.make_block(value2) + elif arr_value.shape == self.shape[::-1]: + return self.make_block(arr_value.T) + else: + assert False # just checking we never get here + # TODO: can we just use coerce_to_target_dtype for all this if hasattr(value, "dtype"): dtype = value.dtype @@ -948,15 +968,6 @@ def setitem(self, indexer, value): dtype, _ = maybe_promote(np.array(value).dtype) return self.astype(dtype).setitem(indexer, value) - if isinstance(indexer, tuple) and len(indexer) == self.ndim: - # test_loc_setitem_consistency, test_loc_setitem_consistency_dt64_to_float - if com.is_null_slice(indexer[0]): - value2 = lib.item_from_zerodim(value) - if lib.is_scalar(value2): - # TODO: de-duplicate with similar in setitem_single_block - value2 = np.full(self.shape, arr_value) - return self.make_block(value2) - dtype = find_common_type([values.dtype, dtype]) assert not is_dtype_equal(self.dtype, dtype) # otherwise should have _can_hold_element diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index b46d7b10af351..da05d90a9beaa 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -600,9 +600,11 @@ def handle_block(blk: Block) -> List[Block]: blk_indexer = (pi, ilocs) blk_indexer = maybe_convert_ix(*blk_indexer) - # 
without the extra condition we fail in tests.indexing.test_indexing:: test_astype_assignment, but + # without the extra condition we fail in + # tests.indexing.test_indexing:: test_astype_assignment, but # that is doing `df.iloc[:, 0:2] = df.iloc[:, 0:2].astype(np.int64) - # which i _think_ *should* be inplace, so should not be casting, which the test wants to do + # which i _think_ *should* be inplace, so should not be casting, + # which the test wants to do if blk._can_hold_element(vfb) and ( not blk.is_object or (is2d and vfb.shape[1] == blk.shape[0]) ):