diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index ef93e80d83d04..fa8bc5ce42685 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -60,6 +60,7 @@ pandas_dtype, ) from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna +from pandas.core.dtypes.generic import ABCDataFrame from pandas.core import nanops, ops from pandas.core.algorithms import checked_add_with_arr, isin, unique1d, value_counts @@ -613,6 +614,11 @@ def _validate_listlike(self, value, allow_object: bool = False): # We treat empty list as our own dtype. return type(self)._from_sequence([], dtype=self.dtype) + if isinstance(value, ABCDataFrame) and value.shape[1] == 1: + # FIXME: kludge + res = self._validate_listlike(value._ixs(0, axis=1), allow_object=allow_object) + return res.reshape(-1, 1) + if hasattr(value, "dtype") and value.dtype == object: # `array` below won't do inference if value is an Index or Series. # so do so here. in the Index case, inferred_type may be cached. diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index f0455c01fa085..6bf696f39d317 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -506,7 +506,7 @@ def array_equals(left: ArrayLike, right: ArrayLike) -> bool: return array_equivalent(left, right, dtype_equal=True) -def infer_fill_value(val): +def infer_fill_value(val, length: int): """ infer the fill value for the nan/NaT from the provided scalar/ndarray/list-like if we are a NaT, return the correct dtyped @@ -514,6 +514,19 @@ def infer_fill_value(val): """ if not is_list_like(val): val = [val] + + if is_extension_array_dtype(val): + # We cannot use dtype._na_value bc pd.NA/pd.NaT do not preserve dtype + if len(val) == length: + # TODO: in this case see if we can avoid making a copy later on + return val + if length == 0: + return val[:0].copy() + + dtype = val.dtype + cls = dtype.construct_array_type() + return cls._from_sequence([dtype._na_value], dtype=dtype).repeat(length) + val = np.array(val, copy=False) if needs_i8_conversion(val.dtype): return np.array("NaT", dtype=val.dtype) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0a1ea4041a10b..9ec0b2588b3c7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3209,6 +3209,7 @@ def _setitem_slice(self, key: slice, value): def _setitem_array(self, key, value): # also raises Exception if object array with NA values if com.is_bool_indexer(key): + # bool indexer is indexing along rows if len(key) != len(self.index): raise ValueError( f"Item wrong length {len(key)} instead of {len(self.index)}!" @@ -3218,18 +3219,33 @@ def _setitem_array(self, key, value): self._check_setitem_copy() self.iloc[indexer] = value else: - if isinstance(value, DataFrame): + if isinstance(value, DataFrame): # 7 test_string_array tests fail if this block is disabled if len(value.columns) != len(key): raise ValueError("Columns must be same length as key") for k1, k2 in zip(key, value.columns): self[k1] = value[k2] + + elif not is_list_like(value): + for col in key: + self[col] = value + + elif isinstance(value, np.ndarray) and value.ndim == 2: + if value.shape[-1] != len(key): + raise ValueError("Columns must be same length as key") + + for i, col in enumerate(key): + self[col] = value[:, i] + + elif np.ndim(value) > 1: + # list of lists + value = DataFrame(value).values + return self._setitem_array(key, value) + else: - self.loc._ensure_listlike_indexer(key, axis=1, value=value) - indexer = self.loc._get_listlike_indexer( - key, axis=1, raise_missing=False - )[1] - self._check_setitem_copy() - self.iloc[:, indexer] = value + if len(value) != len(key): + raise ValueError("Columns must be same length as key") + for i, col in enumerate(key): + self[col] = value[i] def _setitem_frame(self, key, value): # support boolean setting with DataFrame input, e.g. diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py index 399953fc17c73..e1365ac77d7b7 100644 --- a/pandas/core/indexers.py +++ b/pandas/core/indexers.py @@ -221,6 +221,21 @@ def validate_indices(indices: np.ndarray, n: int) -> None: # Indexer Conversion +def ensure_iterable_indexer(ncols: int, column_indexer): + """ + Ensure that our column indexer is something that can be iterated over. + """ + if is_integer(column_indexer): + ilocs = [column_indexer] + elif isinstance(column_indexer, slice): + ilocs = np.arange(ncols)[column_indexer] + elif isinstance(column_indexer, np.ndarray) and is_bool_dtype(column_indexer.dtype): + ilocs = np.arange(len(column_indexer))[column_indexer] + else: + ilocs = column_indexer + return ilocs + + def maybe_convert_indices(indices, n: int): """ Attempt to convert indices into valid, positive indices. diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index c98242cae23f3..10c21ff53d9ab 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -16,6 +16,7 @@ from pandas.core.dtypes.common import ( is_array_like, is_bool_dtype, + is_extension_array_dtype, is_hashable, is_integer, is_iterator, @@ -1595,7 +1596,7 @@ def _setitem_with_indexer(self, indexer, value, name="iloc"): # We are setting an entire column self.obj[key] = value else: - self.obj[key] = infer_fill_value(value) + self.obj[key] = infer_fill_value(value, len(self.obj)) new_indexer = convert_from_missing_indexer_tuple( indexer, self.obj.axes @@ -1674,7 +1675,14 @@ def _setitem_with_indexer_split_path(self, indexer, value, name: str): elif len(ilocs) == 1 and lplane_indexer == len(value) and not is_scalar(pi): # We are setting multiple rows in a single column. - self._setitem_single_column(ilocs[0], value, pi) + if len(value) == len(self.obj): + # Setting entire column, so swapping out + # GH#??? we may want to change this behavior + self.obj._iset_item(ilocs[0], value) + else: + obj = type(self.obj)(value) + self.obj._mgr = self.obj._mgr.setitem_blockwise((pi, ilocs), obj) + self.obj._clear_item_cache() elif len(ilocs) == 1 and 0 != lplane_indexer != len(value): # We are trying to set N values into M entries of a single @@ -1696,9 +1704,26 @@ def _setitem_with_indexer_split_path(self, indexer, value, name: str): pass elif len(ilocs) == len(value): - # We are setting multiple columns in a single row. - for loc, v in zip(ilocs, value): - self._setitem_single_column(loc, v, pi) + # We are setting multiple columns in a with one row which we broadcast + if is_extension_array_dtype(value): # TODO: not hit + val = DataFrame.from_arrays( + [value], index=[0], columns=range(len(value)) + ) + elif isinstance(value, np.ndarray): + val = np.atleast_2d(value) + else: + # avoid numpy casting which can take e.g. ["b", 2] -> ["b", "2"] + val = type(self.obj)([value]) + if lplane_indexer != 1: + # broadcast to length of pi + # TODO: EA compat for broadcast_to + arrs = list(val._iter_column_arrays()) + arrs = [np.broadcast_to(x, lplane_indexer) for x in arrs] + val = type(self.obj)._from_arrays( + arrs, index=range(lplane_indexer), columns=range(len(arrs)) + ) + self.obj._mgr = self.obj._mgr.setitem_blockwise((pi, ilocs), val) + self.obj._clear_item_cache() elif len(ilocs) == 1 and com.is_null_slice(pi) and len(self.obj) == 0: # This is a setitem-with-expansion, see @@ -1706,7 +1731,10 @@ def _setitem_with_indexer_split_path(self, indexer, value, name: str): # e.g. df = DataFrame(columns=["x", "y"]) # df["x"] = df["x"].astype(np.int64) # df.loc[:, "x"] = [1, 2, 3] - self._setitem_single_column(ilocs[0], value, pi) + + # Setting entire column, so swapping out + # GH#??? we may want to change this behavior + self.obj._iset_item(ilocs[0], value) else: raise ValueError( @@ -1717,8 +1745,8 @@ def _setitem_with_indexer_split_path(self, indexer, value, name: str): else: # scalar value - for loc in ilocs: - self._setitem_single_column(loc, value, pi) + self.obj._mgr = self.obj._mgr.setitem_blockwise((pi, ilocs), value) + self.obj._clear_item_cache() def _setitem_with_indexer_2d_value(self, indexer, value): # We get here with np.ndim(value) == 2, excluding DataFrame, @@ -1734,14 +1762,14 @@ def _setitem_with_indexer_2d_value(self, indexer, value): "Must have equal len keys and value when setting with an ndarray" ) - for i, loc in enumerate(ilocs): - # setting with a list, re-coerces - self._setitem_single_column(loc, value[:, i].tolist(), pi) + # wrap in DataFrame to coerce where appropriate + obj = type(self.obj)(value.tolist()) + self.obj._mgr = self.obj._mgr.setitem_blockwise((pi, ilocs), obj) + self.obj._clear_item_cache() def _setitem_with_indexer_frame_value(self, indexer, value: DataFrame, name: str): ilocs = self._ensure_iterable_column_indexer(indexer[1]) - sub_indexer = list(indexer) pi = indexer[0] multiindex_indexer = isinstance(self.obj.columns, ABCMultiIndex) @@ -1750,26 +1778,14 @@ def _setitem_with_indexer_frame_value(self, indexer, value: DataFrame, name: str # We do not want to align the value in case of iloc GH#37728 if name == "iloc": - for i, loc in enumerate(ilocs): - val = value.iloc[:, i] - self._setitem_single_column(loc, val, pi) + self.obj._mgr = self.obj._mgr.setitem_blockwise((pi, ilocs), value) + self.obj._clear_item_cache() elif not unique_cols and value.columns.equals(self.obj.columns): # We assume we are already aligned, see # test_iloc_setitem_frame_duplicate_columns_multiple_blocks - for loc in ilocs: - item = self.obj.columns[loc] - if item in value: - sub_indexer[1] = item - val = self._align_series( - tuple(sub_indexer), - value.iloc[:, loc], - multiindex_indexer, - ) - else: - val = np.nan - - self._setitem_single_column(loc, val, pi) + self.obj._mgr = self.obj._mgr.setitem_blockwise((pi, ilocs), value) + self.obj._clear_item_cache() elif not unique_cols: raise ValueError("Setting with non-unique columns is not allowed.") @@ -1778,9 +1794,8 @@ def _setitem_with_indexer_frame_value(self, indexer, value: DataFrame, name: str for loc in ilocs: item = self.obj.columns[loc] if item in value: - sub_indexer[1] = item val = self._align_series( - tuple(sub_indexer), value[item], multiindex_indexer + (pi, item), value[item], multiindex_indexer ) else: val = np.nan diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 6563903adf9bb..62ec2c3b9e798 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -72,7 +72,7 @@ ) from pandas.core.base import PandasObject import pandas.core.common as com -from pandas.core.construction import extract_array +from pandas.core.construction import ensure_wrapped_if_datetimelike, extract_array from pandas.core.indexers import ( check_setitem_lengths, is_empty_indexer, @@ -901,10 +901,60 @@ def setitem(self, indexer, value): if self.is_numeric: value = np.nan - # coerce if block dtype can store value values = self.values + + # FIXME: avoid getting here with DataFrame value; ambiguous casting + if is_extension_array_dtype(getattr(value, "dtype", None)): + # We need to be careful not to allow through strings that + # can be parsed to EADtypes + is_ea_value = True + arr_value = value + else: + is_ea_value = False + arr_value = np.array(value) + + # TODO: why the ndim restriction here? + if ( + self.dtype == object + and arr_value.dtype.kind in ["m", "M"] + and arr_value.size > 0 + and self.ndim == 2 + ): + # get Timestamp/Timedelta, numpy would cast to ints (yikes!) + # FIXME: np.asarray(dta, dtype=object), dta.to_numpy(object) + # both have the same wrong numpy behavior + arr_value = ensure_wrapped_if_datetimelike(arr_value) + arr_value = np.asarray(arr_value.astype(object)) + value = arr_value + + if transpose: + values = values.T + + # length checking + check_setitem_lengths(indexer, value, values) + exact_match = is_exact_shape_match(values, arr_value) + if not self._can_hold_element(value): # current dtype cannot store value, coerce to common dtype + + is_full = exact_match or ( + isinstance(indexer, tuple) + and len(indexer) == self.ndim + and com.is_null_slice(indexer[0]) + ) + if is_full: + # test_loc_setitem_consistency, + # test_loc_setitem_consistency_dt64_to_float + value2 = lib.item_from_zerodim(value) + if lib.is_scalar(value2): + # TODO: de-duplicate with similar in setitem_single_block + value2 = np.full(self.shape, arr_value) + return self.make_block(value2) + elif arr_value.shape == self.shape[::-1]: + return self.make_block(arr_value.T) + else: + assert False # just checking we never get here + # TODO: can we just use coerce_to_target_dtype for all this if hasattr(value, "dtype"): dtype = value.dtype @@ -930,21 +980,6 @@ def setitem(self, indexer, value): return self # value must be storable at this moment - if is_extension_array_dtype(getattr(value, "dtype", None)): - # We need to be careful not to allow through strings that - # can be parsed to EADtypes - is_ea_value = True - arr_value = value - else: - is_ea_value = False - arr_value = np.array(value) - - if transpose: - values = values.T - - # length checking - check_setitem_lengths(indexer, value, values) - exact_match = is_exact_shape_match(values, arr_value) if is_empty_indexer(indexer, arr_value): # GH#8669 empty indexers pass @@ -1666,6 +1701,26 @@ def setitem(self, indexer, value): # we are always 1-D indexer = indexer[0] + if isinstance(indexer, np.ndarray) and self.ndim == indexer.ndim == 2: + # possibly constructed with maybe_convert_ix + + indexer = indexer.squeeze() + indexer = np.atleast_1d(indexer) + + if ( + isinstance(value, (np.ndarray, ExtensionArray)) + and value.ndim == self.ndim == 2 + ): + # TODO: test for this + value = value.T + if value.shape[0] != 1: + raise ValueError + value = value[0] + elif isinstance(value, ABCDataFrame) and self.ndim == 2: + if value.shape[1] != 1: + raise ValueError + value = value._ixs(0, axis=1)._values + check_setitem_lengths(indexer, value, self.values) self.values[indexer] = value return self diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index f973ceb940415..da05d90a9beaa 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -43,8 +43,9 @@ import pandas.core.algorithms as algos from pandas.core.arrays.sparse import SparseDtype from pandas.core.construction import extract_array -from pandas.core.indexers import maybe_convert_indices +from pandas.core.indexers import ensure_iterable_indexer, maybe_convert_indices from pandas.core.indexes.api import Index, ensure_index +from pandas.core.indexing import maybe_convert_ix from pandas.core.internals.base import DataManager from pandas.core.internals.blocks import ( Block, @@ -564,6 +565,74 @@ def where(self, other, cond, align: bool, errors: str, axis: int) -> BlockManage def setitem(self, indexer, value) -> BlockManager: return self.apply("setitem", indexer=indexer, value=value) + # TODO: could just operate inplace, so we dont end up swapping out + # parent frame/series _mgr? + def setitem_blockwise(self, indexer, value) -> BlockManager: + result_blocks = [] + + # assuming for now 2D + pi, col_indexer = indexer + + col_indexer = ensure_iterable_indexer(len(self.items), col_indexer) + col_indexer = Index(col_indexer) + + def handle_block(blk: Block) -> List[Block]: + locs = Index(blk.mgr_locs.as_array).intersection(col_indexer) + # For blocks that are among self.blocks (i.e. not reached via recursion) + # this should match self.blklocs[locs] + ilocs = [list(blk.mgr_locs).index(x) for x in locs] + + rlocs = col_indexer.get_indexer(locs) + + if not len(ilocs): + nbs = [blk] + else: + is2d = False + vfb = value # vfb -> value_for_block + if getattr(value, "ndim", 0) == 2: + is2d = True + if isinstance(value, ABCDataFrame): + # TODO: similar to what we have in BlockManager.apply? + vfb = value.iloc[:, rlocs] + else: + vfb = value[:, rlocs] + + blk_indexer = (pi, ilocs) + blk_indexer = maybe_convert_ix(*blk_indexer) + + # without the extra condition we fail in + # tests.indexing.test_indexing:: test_astype_assignment, but + # that is doing `df.iloc[:, 0:2] = df.iloc[:, 0:2].astype(np.int64) + # which i _think_ *should* be inplace, so should not be casting, + # which the test wants to do + if blk._can_hold_element(vfb) and ( + not blk.is_object or (is2d and vfb.shape[1] == blk.shape[0]) + ): + nb = blk.setitem(blk_indexer, vfb) + nbs = [nb] + + elif blk.shape[0] == 1: + # casting + nb = blk.setitem(blk_indexer, vfb) + nbs = [nb] + + else: + # recurse -> operate column-wise + blocks = blk._split() + nbs = [] + for subblk in blocks: + nbs2 = handle_block(subblk) + nbs.extend(nbs2) + + return nbs + + for blk in self.blocks: + nbs = handle_block(blk) + + result_blocks.extend(nbs) + + return type(self).from_blocks(result_blocks, self.axes) + def putmask(self, mask, new, align: bool = True): if align: diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 1f0181eec8830..d4cf173b4463e 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -16,9 +16,12 @@ import numpy as np import pytest +from pandas.core.dtypes.missing import infer_fill_value as infer_fill_value_orig + import pandas as pd import pandas._testing as tm -from pandas.core.arrays.numpy_ import PandasArray, PandasDtype +from pandas.core.arrays import PandasArray, PandasDtype, StringArray +from pandas.core.construction import extract_array from . import base @@ -28,6 +31,31 @@ def dtype(request): return PandasDtype(np.dtype(request.param)) +orig_setitem = pd.core.internals.Block.setitem + + +def setitem(self, indexer, value): + # patch Block.setitem + value = extract_array(value, extract_numpy=True) + if isinstance(value, PandasArray) and not isinstance(value, StringArray): + value = value.to_numpy() + if self.ndim == 2 and value.ndim == 1: + # TODO(EA2D): special case not needed with 2D EAs + value = np.atleast_2d(value) + + return orig_setitem(self, indexer, value) + + +def infer_fill_value(val, length: int): + # GH#39044 we have to patch core.dtypes.missing.infer_fill_value + # to unwrap PandasArray bc it won't recognize PandasArray with + # is_extension_dtype + if isinstance(val, PandasArray): + val = val.to_numpy() + + return infer_fill_value_orig(val, length) + + @pytest.fixture def allow_in_pandas(monkeypatch): """ @@ -47,6 +75,8 @@ def allow_in_pandas(monkeypatch): """ with monkeypatch.context() as m: m.setattr(PandasArray, "_typ", "extension") + m.setattr(pd.core.indexing, "infer_fill_value", infer_fill_value) + m.setattr(pd.core.internals.Block, "setitem", setitem) yield @@ -501,6 +531,18 @@ def test_setitem_slice(self, data, box_in_series): def test_setitem_loc_iloc_slice(self, data): super().test_setitem_loc_iloc_slice(data) + def test_setitem_series(self, data, full_indexer): + # https://github.com/pandas-dev/pandas/issues/32395 + ser = pd.Series(data, name="data") + result = pd.Series(index=ser.index, dtype=object, name="data") + + key = full_indexer(ser) + result.loc[key] = ser + + # For PandasArray we expect to get unboxed to numpy + expected = pd.Series(data.to_numpy(), name="data") + self.assert_series_equal(result, expected) + @skip_nested class TestParsing(BaseNumPyTests, base.BaseParsingTests):