diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index ef645313de614..a2f200192eacb 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -505,14 +505,28 @@ def array_equals(left: ArrayLike, right: ArrayLike) -> bool: return array_equivalent(left, right, dtype_equal=True) -def infer_fill_value(val): +def infer_fill_value(val, length: int): """ - infer the fill value for the nan/NaT from the provided - scalar/ndarray/list-like if we are a NaT, return the correct dtyped - element to provide proper block construction + `val` is going to be inserted as (part of) a new column in a DataFrame + with the given length. If val cannot be made to fit exactly, + find an appropriately-dtyped NA value to construct a complete column from, + which we will later set `val` into. """ if not is_list_like(val): val = [val] + + if is_extension_array_dtype(val): + # We cannot use dtype._na_value bc pd.NA/pd.NaT do not preserve dtype + if len(val) == length: + # TODO: in this case see if we can avoid making a copy later on + return val + if length == 0: + return val[:0].copy() + + dtype = val.dtype + cls = dtype.construct_array_type() + return cls._from_sequence([dtype.na_value], dtype=dtype).repeat(length) + val = np.array(val, copy=False) if needs_i8_conversion(val.dtype): return np.array("NaT", dtype=val.dtype) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index cc7c5f666feda..fd6adcc62d032 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1605,7 +1605,7 @@ def _setitem_with_indexer(self, indexer, value, name="iloc"): # We are setting an entire column self.obj[key] = value else: - self.obj[key] = infer_fill_value(value) + self.obj[key] = infer_fill_value(value, len(self.obj)) new_indexer = convert_from_missing_indexer_tuple( indexer, self.obj.axes diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 3c27e34dcbcf6..41f8d3064b6b4 100644 --- 
a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -69,6 +69,8 @@ Categorical, DatetimeArray, ExtensionArray, + FloatingArray, + IntegerArray, PandasArray, TimedeltaArray, ) @@ -623,10 +625,17 @@ def astype(self, dtype, copy: bool = False, errors: str = "raise"): ) raise TypeError(msg) + values = self.values dtype = pandas_dtype(dtype) + if isinstance(dtype, ExtensionDtype) and self.values.ndim == 2: + # TODO(EA2D): kludge not needed with 2D EAs (astype_nansafe would raise) + # note DataFrame.astype has special handling to avoid getting here + if self.shape[0] != 1: + raise NotImplementedError("Need 2D EAs!") + values = values[0] try: - new_values = self._astype(dtype, copy=copy) + new_values = astype_block_compat(values, dtype, copy=copy) except (ValueError, TypeError): # e.g. astype_nansafe can fail on object-dtype of strings # trying to convert to float @@ -645,25 +654,6 @@ def astype(self, dtype, copy: bool = False, errors: str = "raise"): ) return newb - def _astype(self, dtype: DtypeObj, copy: bool) -> ArrayLike: - values = self.values - - if is_datetime64tz_dtype(dtype) and is_datetime64_dtype(values.dtype): - return astype_dt64_to_dt64tz(values, dtype, copy, via_utc=True) - - if is_dtype_equal(values.dtype, dtype): - if copy: - return values.copy() - return values - - if isinstance(values, ExtensionArray): - values = values.astype(dtype, copy=copy) - - else: - values = astype_nansafe(values, dtype, copy=copy) - - return values - def convert( self, copy: bool = True, @@ -908,6 +898,15 @@ def setitem(self, indexer, value): # current dtype cannot store value, coerce to common dtype return self.coerce_to_target_dtype(value).setitem(indexer, value) + value = extract_array(value, extract_numpy=True) + + if isinstance(value, (IntegerArray, FloatingArray)) and not value._mask.any(): + # GH#38896 + value = value.to_numpy(value.dtype.numpy_dtype) + if self.ndim == 2 and value.ndim == 1: + # TODO(EA2D): special case not needed with 2D EAs + 
value = np.atleast_2d(value).T + if self.dtype.kind in ["m", "M"]: arr = self.array_values().T arr[indexer] = value @@ -1882,6 +1881,11 @@ class NumericBlock(Block): is_numeric = True def _can_hold_element(self, element: Any) -> bool: + if isinstance(element, (IntegerArray, FloatingArray)): + # GH#38896 + if element._mask.any(): + return False + return can_hold_element(self.dtype, element) @property @@ -2500,3 +2504,24 @@ def safe_reshape(arr: ArrayLike, new_shape: Shape) -> ArrayLike: # TODO(EA2D): special case will be unnecessary with 2D EAs arr = np.asarray(arr).reshape(new_shape) return arr + + +def astype_block_compat(values: ArrayLike, dtype: DtypeObj, copy: bool) -> ArrayLike: + """ + Series/DataFrame implementation of .astype + """ + if is_datetime64tz_dtype(dtype) and is_datetime64_dtype(values.dtype): + return astype_dt64_to_dt64tz(values, dtype, copy, via_utc=True) + + if is_dtype_equal(values.dtype, dtype): + if copy: + return values.copy() + return values + + if isinstance(values, ExtensionArray): + values = values.astype(dtype, copy=copy) + + else: + values = astype_nansafe(values, dtype, copy=copy) + + return values diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index 4ee9cb89fc227..fc0b34e05dce8 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -32,8 +32,11 @@ from pandas.tests.extension import base -def make_data(): - return list(range(1, 9)) + [pd.NA] + list(range(10, 98)) + [pd.NA] + [99, 100] +def make_data(with_nas: bool = True): + if with_nas: + return list(range(1, 9)) + [pd.NA] + list(range(10, 98)) + [pd.NA] + [99, 100] + + return list(range(1, 101)) @pytest.fixture( @@ -52,9 +55,10 @@ def dtype(request): return request.param() -@pytest.fixture -def data(dtype): - return pd.array(make_data(), dtype=dtype) +@pytest.fixture(params=[True, False]) +def data(dtype, request): + with_nas = request.param + return pd.array(make_data(with_nas), 
dtype=dtype) @pytest.fixture @@ -193,7 +197,21 @@ class TestGetitem(base.BaseGetitemTests): class TestSetitem(base.BaseSetitemTests): - pass + def test_setitem_series(self, data, full_indexer): + # https://github.com/pandas-dev/pandas/issues/32395 + # overridden because we have a different `expected` in some cases + ser = expected = pd.Series(data, name="data") + result = pd.Series(index=ser.index, dtype=object, name="data") + + key = full_indexer(ser) + result.loc[key] = ser + + if not data._mask.any(): + # GH#38896 like we do with ndarray, we set the values inplace + # but cast to the new numpy dtype + expected = pd.Series(data.to_numpy(data.dtype.numpy_dtype), name="data") + + self.assert_series_equal(result, expected) class TestMissing(base.BaseMissingTests): diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index a5b54bc153f5d..0f7d4524f9943 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -17,10 +17,12 @@ import pytest from pandas.core.dtypes.dtypes import ExtensionDtype, PandasDtype +from pandas.core.dtypes.missing import infer_fill_value as infer_fill_value_orig import pandas as pd import pandas._testing as tm -from pandas.core.arrays.numpy_ import PandasArray +from pandas.core.arrays import PandasArray, StringArray +from pandas.core.construction import extract_array from pandas.tests.extension import base @@ -29,6 +31,31 @@ def dtype(request): return PandasDtype(np.dtype(request.param)) +orig_setitem = pd.core.internals.Block.setitem + + +def setitem(self, indexer, value): + # patch Block.setitem + value = extract_array(value, extract_numpy=True) + if isinstance(value, PandasArray) and not isinstance(value, StringArray): + value = value.to_numpy() + if self.ndim == 2 and value.ndim == 1: + # TODO(EA2D): special case not needed with 2D EAs + value = np.atleast_2d(value) + + return orig_setitem(self, indexer, value) + + +def infer_fill_value(val, length: int): + # GH#39044 
we have to patch core.dtypes.missing.infer_fill_value + # to unwrap PandasArray bc it won't recognize PandasArray with + # is_extension_array_dtype + if isinstance(val, PandasArray): + val = val.to_numpy() + + return infer_fill_value_orig(val, length) + + @pytest.fixture def allow_in_pandas(monkeypatch): """ @@ -48,6 +75,8 @@ def allow_in_pandas(monkeypatch): """ with monkeypatch.context() as m: m.setattr(PandasArray, "_typ", "extension") + m.setattr(pd.core.indexing, "infer_fill_value", infer_fill_value) + m.setattr(pd.core.internals.Block, "setitem", setitem) yield @@ -457,6 +486,42 @@ def test_setitem_slice(self, data, box_in_series): def test_setitem_loc_iloc_slice(self, data): super().test_setitem_loc_iloc_slice(data) + def test_setitem_with_expansion_dataframe_column(self, data, full_indexer, request): + # https://github.com/pandas-dev/pandas/issues/32395 + df = pd.DataFrame({"data": pd.Series(data)}) + result = pd.DataFrame(index=df.index) + + key = full_indexer(df) + result.loc[key, "data"] = df["data"]._values + + expected = pd.DataFrame({"data": data}) + if data.dtype.numpy_dtype != object: + # For PandasArray we expect to get unboxed to numpy + expected = pd.DataFrame({"data": data.to_numpy()}) + + if isinstance(key, slice) and ( + key == slice(None) and data.dtype.numpy_dtype != object + ): + mark = pytest.mark.xfail( + reason="This case goes through a different code path" + ) + # Other cases go through Block.setitem + request.node.add_marker(mark) + + self.assert_frame_equal(result, expected) + + def test_setitem_series(self, data, full_indexer): + # https://github.com/pandas-dev/pandas/issues/32395 + ser = pd.Series(data, name="data") + result = pd.Series(index=ser.index, dtype=object, name="data") + + key = full_indexer(ser) + result.loc[key] = ser + + # For PandasArray we expect to get unboxed to numpy + expected = pd.Series(data.to_numpy(), name="data") + self.assert_series_equal(result, expected) + @skip_nested class TestParsing(BaseNumPyTests, 
base.BaseParsingTests): diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 1cd352e4e0899..6edc783afb715 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -18,6 +18,7 @@ Index, IndexSlice, MultiIndex, + NaT, Series, SparseDtype, Timedelta, @@ -1348,6 +1349,19 @@ def test_loc_setitem_categorical_column_retains_dtype(self, ordered): expected = DataFrame({"A": [1], "B": Categorical(["b"], ordered=ordered)}) tm.assert_frame_equal(result, expected) + def test_loc_setitem_ea_not_full_column(self): + # GH#39163 + df = DataFrame({"A": range(5)}) + + val = date_range("2016-01-01", periods=3, tz="US/Pacific") + + df.loc[[0, 1, 2], "B"] = val + + bex = val.append(DatetimeIndex([NaT, NaT], dtype=val.dtype)) + expected = DataFrame({"A": range(5), "B": bex}) + assert expected.dtypes["B"] == val.dtype + tm.assert_frame_equal(df, expected) + class TestLocCallable: def test_frame_loc_getitem_callable(self): diff --git a/pandas/util/_exceptions.py b/pandas/util/_exceptions.py index 5ca96a1f9989f..9fdfc30ef7e76 100644 --- a/pandas/util/_exceptions.py +++ b/pandas/util/_exceptions.py @@ -31,7 +31,7 @@ def find_stack_level() -> int: if stack[n].function == "astype": break - while stack[n].function in ["astype", "apply", "_astype"]: + while stack[n].function in ["astype", "apply", "astype_block_compat"]: # e.g. # bump up Block.astype -> BlockManager.astype -> NDFrame.astype # bump up Datetime.Array.astype -> DatetimeIndex.astype