diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index db5cce8459ca2..c0b11b6250056 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -626,6 +626,7 @@ Indexing - Bug in :meth:`Series.reset_index` not ignoring ``name`` argument when ``drop`` and ``inplace`` are set to ``True`` (:issue:`44575`) - Bug in :meth:`DataFrame.loc.__setitem__` and :meth:`DataFrame.iloc.__setitem__` with mixed dtypes sometimes failing to operate in-place (:issue:`44345`) - Bug in :meth:`DataFrame.loc.__getitem__` incorrectly raising ``KeyError`` when selecting a single column with a boolean key (:issue:`44322`). +- Bug in setting :meth:`DataFrame.iloc` with a single ``ExtensionDtype`` column and setting 2D values e.g. ``df.iloc[:] = df.values`` incorrectly raising (:issue:`44514`) - Bug in indexing on columns with ``loc`` or ``iloc`` using a slice with a negative step with ``ExtensionDtype`` columns incorrectly raising (:issue:`44551`) - Bug in :meth:`IntervalIndex.get_indexer_non_unique` returning boolean mask instead of array of integers for a non unique and non monotonic index (:issue:`44084`) - Bug in :meth:`IntervalIndex.get_indexer_non_unique` not handling targets of ``dtype`` 'object' with NaNs correctly (:issue:`44482`) @@ -737,6 +738,7 @@ ExtensionArray - Bug in :func:`array` failing to preserve :class:`PandasArray` (:issue:`43887`) - NumPy ufuncs ``np.abs``, ``np.positive``, ``np.negative`` now correctly preserve dtype when called on ExtensionArrays that implement ``__abs__, __pos__, __neg__``, respectively. In particular this is fixed for :class:`TimedeltaArray` (:issue:`43899`) - Avoid raising ``PerformanceWarning`` about fragmented DataFrame when using many columns with an extension dtype (:issue:`44098`) +- Bug in :class:`IntegerArray` and :class:`FloatingArray` construction incorrectly coercing mismatched NA values (e.g. ``np.timedelta64("NaT")``) to numeric NA (:issue:`44514`) - Bug in :meth:`BooleanArray.__eq__` and :meth:`BooleanArray.__ne__` raising ``TypeError`` on comparison with an incompatible type (like a string). This caused :meth:`DataFrame.replace` to sometimes raise a ``TypeError`` if a nullable boolean column was included (:issue:`44499`) - diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index cd04f4f6e4b3a..585b535775397 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -248,6 +248,36 @@ cdef bint checknull_with_nat_and_na(object obj): return checknull_with_nat(obj) or obj is C_NA +@cython.wraparound(False) +@cython.boundscheck(False) +def is_numeric_na(values: ndarray) -> ndarray: + """ + Check for NA values consistent with IntegerArray/FloatingArray. + + Similar to a vectorized is_valid_na_for_dtype restricted to numeric dtypes. + + Returns + ------- + ndarray[bool] + """ + cdef: + ndarray[uint8_t] result + Py_ssize_t i, N + object val + + N = len(values) + result = np.zeros(N, dtype=np.uint8) + + for i in range(N): + val = values[i] + if checknull(val): + if val is None or val is C_NA or util.is_nan(val) or is_decimal_na(val): + result[i] = True + else: + raise TypeError(f"'values' contains non-numeric NA {val}") + return result.view(bool) + + # ----------------------------------------------------------------------------- # Implementation of NA singleton diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index 1e7f1aff52d2e..5e55715ee0e97 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -4,7 +4,10 @@ import numpy as np -from pandas._libs import lib +from pandas._libs import ( + lib, + missing as libmissing, +) from pandas._typing import ( ArrayLike, AstypeArg, @@ -27,7 +30,6 @@ ExtensionDtype, register_extension_dtype, ) -from pandas.core.dtypes.missing import isna from pandas.core.arrays import ExtensionArray from pandas.core.arrays.numeric import ( @@ -129,8 +131,7 @@ def coerce_to_array( if is_object_dtype(values): inferred_type = lib.infer_dtype(values, skipna=True) if inferred_type == "empty": - values = np.empty(len(values)) - values.fill(np.nan) + pass elif inferred_type not in [ "floating", "integer", @@ -146,13 +147,15 @@ def coerce_to_array( elif not (is_integer_dtype(values) or is_float_dtype(values)): raise TypeError(f"{values.dtype} cannot be converted to a FloatingDtype") + if values.ndim != 1: + raise TypeError("values must be a 1D list-like") + if mask is None: - mask = isna(values) + mask = libmissing.is_numeric_na(values) + else: assert len(mask) == len(values) - if not values.ndim == 1: - raise TypeError("values must be a 1D list-like") if not mask.ndim == 1: raise TypeError("mask must be a 1D list-like") diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 12bef068ef44b..0e82ef731bb63 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -7,6 +7,7 @@ from pandas._libs import ( iNaT, lib, + missing as libmissing, ) from pandas._typing import ( ArrayLike, @@ -32,7 +33,6 @@ is_string_dtype, pandas_dtype, ) -from pandas.core.dtypes.missing import isna from pandas.core.arrays import ExtensionArray from pandas.core.arrays.masked import BaseMaskedDtype @@ -183,8 +183,7 @@ def coerce_to_array( if is_object_dtype(values) or is_string_dtype(values): inferred_type = lib.infer_dtype(values, skipna=True) if inferred_type == "empty": - values = np.empty(len(values)) - values.fill(np.nan) + pass elif inferred_type not in [ "floating", "integer", @@ -202,13 +201,14 @@ def coerce_to_array( elif not (is_integer_dtype(values) or is_float_dtype(values)): raise TypeError(f"{values.dtype} cannot be converted to an IntegerDtype") + if values.ndim != 1: + raise TypeError("values must be a 1D list-like") + if mask is None: - mask = isna(values) + mask = libmissing.is_numeric_na(values) else: assert len(mask) == len(values) - if values.ndim != 1: - raise TypeError("values must be a 1D list-like") if mask.ndim != 1: raise TypeError("mask must be a 1D list-like") diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 550bc4ac56d4b..1ee16879feb1b 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1508,6 +1508,17 @@ def setitem(self, indexer, value): # we are always 1-D indexer = indexer[0] + # TODO(EA2D): not needed with 2D EAS + if isinstance(value, (np.ndarray, ExtensionArray)) and value.ndim == 2: + assert value.shape[1] == 1 + # error: No overload variant of "__getitem__" of "ExtensionArray" + # matches argument type "Tuple[slice, int]" + value = value[:, 0] # type: ignore[call-overload] + elif isinstance(value, ABCDataFrame): + # TODO: should we avoid getting here with DataFrame? + assert value.shape[1] == 1 + value = value._ixs(0, axis=1)._values + check_setitem_lengths(indexer, value, self.values) self.values[indexer] = value return self diff --git a/pandas/tests/arrays/floating/test_construction.py b/pandas/tests/arrays/floating/test_construction.py index 4ce3dd35b538b..4b7b237d2eb7c 100644 --- a/pandas/tests/arrays/floating/test_construction.py +++ b/pandas/tests/arrays/floating/test_construction.py @@ -97,14 +97,18 @@ def test_to_array_mixed_integer_float(): np.array(["foo"]), [[1, 2], [3, 4]], [np.nan, {"a": 1}], + # GH#44514 all-NA case used to get quietly swapped out before checking ndim + np.array([pd.NA] * 6, dtype=object).reshape(3, 2), ], ) def test_to_array_error(values): # error in converting existing arrays to FloatingArray - msg = ( - r"(:?.* cannot be converted to a FloatingDtype)" - r"|(:?values must be a 1D list-like)" - r"|(:?Cannot pass scalar)" + msg = "|".join( + [ + "cannot be converted to a FloatingDtype", + "values must be a 1D list-like", + "Cannot pass scalar", + ] ) with pytest.raises((TypeError, ValueError), match=msg): pd.array(values, dtype="Float64") diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index a2d100db81a2c..208a1a1757be2 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -1,6 +1,13 @@ import numpy as np import pytest +from pandas.core.dtypes.dtypes import ( + DatetimeTZDtype, + IntervalDtype, + PandasDtype, + PeriodDtype, +) + import pandas as pd import pandas._testing as tm from pandas.tests.extension.base.base import BaseExtensionTests @@ -357,6 +364,36 @@ def test_setitem_series(self, data, full_indexer): ) self.assert_series_equal(result, expected) + def test_setitem_frame_2d_values(self, data, request): + # GH#44514 + df = pd.DataFrame({"A": data}) + + # Avoiding using_array_manager fixture + # https://github.com/pandas-dev/pandas/pull/44514#discussion_r754002410 + using_array_manager = isinstance(df._mgr, pd.core.internals.ArrayManager) + if using_array_manager: + if not isinstance( + data.dtype, (PandasDtype, PeriodDtype, IntervalDtype, DatetimeTZDtype) + ): + # These dtypes have non-broken implementations of _can_hold_element + mark = pytest.mark.xfail(reason="Goes through split path, loses dtype") + request.node.add_marker(mark) + + df = pd.DataFrame({"A": data}) + orig = df.copy() + + df.iloc[:] = df + self.assert_frame_equal(df, orig) + + df.iloc[:-1] = df.iloc[:-1] + self.assert_frame_equal(df, orig) + + df.iloc[:] = df.values + self.assert_frame_equal(df, orig) + + df.iloc[:-1] = df.values[:-1] + self.assert_frame_equal(df, orig) + def test_delitem_series(self, data): # GH#40763 ser = pd.Series(data, name="data") diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 942da38dc5a26..b102bcdae57d9 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1217,6 +1217,75 @@ def test_setitem_array_as_cell_value(self): expected = DataFrame({"a": [np.zeros((2,))], "b": [np.zeros((2, 2))]}) tm.assert_frame_equal(df, expected) + # with AM goes through split-path, loses dtype + @td.skip_array_manager_not_yet_implemented + def test_iloc_setitem_nullable_2d_values(self): + df = DataFrame({"A": [1, 2, 3]}, dtype="Int64") + orig = df.copy() + + df.loc[:] = df.values[:, ::-1] + tm.assert_frame_equal(df, orig) + + df.loc[:] = pd.core.arrays.PandasArray(df.values[:, ::-1]) + tm.assert_frame_equal(df, orig) + + df.iloc[:] = df.iloc[:, :] + tm.assert_frame_equal(df, orig) + + @pytest.mark.parametrize( + "null", [pd.NaT, pd.NaT.to_numpy("M8[ns]"), pd.NaT.to_numpy("m8[ns]")] + ) + def test_setting_mismatched_na_into_nullable_fails( + self, null, any_numeric_ea_dtype + ): + # GH#44514 don't cast mismatched nulls to pd.NA + df = DataFrame({"A": [1, 2, 3]}, dtype=any_numeric_ea_dtype) + ser = df["A"] + arr = ser._values + + msg = "|".join( + [ + r"int\(\) argument must be a string, a bytes-like object or a " + "(real )?number, not 'NaTType'", + r"timedelta64\[ns\] cannot be converted to an? (Floating|Integer)Dtype", + r"datetime64\[ns\] cannot be converted to an? (Floating|Integer)Dtype", + "object cannot be converted to a FloatingDtype", + "'values' contains non-numeric NA", + ] + ) + with pytest.raises(TypeError, match=msg): + arr[0] = null + + with pytest.raises(TypeError, match=msg): + arr[:2] = [null, null] + + with pytest.raises(TypeError, match=msg): + ser[0] = null + + with pytest.raises(TypeError, match=msg): + ser[:2] = [null, null] + + with pytest.raises(TypeError, match=msg): + ser.iloc[0] = null + + with pytest.raises(TypeError, match=msg): + ser.iloc[:2] = [null, null] + + with pytest.raises(TypeError, match=msg): + df.iloc[0, 0] = null + + with pytest.raises(TypeError, match=msg): + df.iloc[:2, 0] = [null, null] + + # Multi-Block + df2 = df.copy() + df2["B"] = ser.copy() + with pytest.raises(TypeError, match=msg): + df2.iloc[0, 0] = null + + with pytest.raises(TypeError, match=msg): + df2.iloc[:2, 0] = [null, null] + class TestDataFrameIndexingUInt64: def test_setitem(self, uint64_frame): diff --git a/pandas/tests/series/methods/test_clip.py b/pandas/tests/series/methods/test_clip.py index 247f0d50772ce..bc6d5aeb0a581 100644 --- a/pandas/tests/series/methods/test_clip.py +++ b/pandas/tests/series/methods/test_clip.py @@ -46,9 +46,14 @@ def test_series_clipping_with_na_values(self, any_numeric_ea_dtype, nulls_fixtur # Ensure that clipping method can handle NA values with out failing # GH#40581 - s = Series([nulls_fixture, 1.0, 3.0], dtype=any_numeric_ea_dtype) - s_clipped_upper = s.clip(upper=2.0) - s_clipped_lower = s.clip(lower=2.0) + if nulls_fixture is pd.NaT: + # constructor will raise, see + # test_constructor_mismatched_null_nullable_dtype + return + + ser = Series([nulls_fixture, 1.0, 3.0], dtype=any_numeric_ea_dtype) + s_clipped_upper = ser.clip(upper=2.0) + s_clipped_lower = ser.clip(lower=2.0) expected_upper = Series([nulls_fixture, 1.0, 2.0], dtype=any_numeric_ea_dtype) expected_lower = Series([nulls_fixture, 2.0, 3.0], dtype=any_numeric_ea_dtype) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 692c040a33ff8..43e4c8364c06c 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -13,7 +13,10 @@ iNaT, lib, ) -from pandas.compat.numpy import np_version_under1p19 +from pandas.compat.numpy import ( + np_version_under1p19, + np_version_under1p20, +) import pandas.util._test_decorators as td from pandas.core.dtypes.common import ( @@ -1817,6 +1820,33 @@ def test_constructor_bool_dtype_missing_values(self): expected = Series(True, index=[0], dtype="bool") tm.assert_series_equal(result, expected) + @pytest.mark.filterwarnings( + "ignore:elementwise comparison failed:DeprecationWarning" + ) + @pytest.mark.xfail( + np_version_under1p20, reason="np.array([td64nat, float, float]) raises" + ) + @pytest.mark.parametrize("func", [Series, DataFrame, Index, pd.array]) + def test_constructor_mismatched_null_nullable_dtype( + self, func, any_numeric_ea_dtype + ): + # GH#44514 + msg = "|".join( + [ + "cannot safely cast non-equivalent object", + r"int\(\) argument must be a string, a bytes-like object " + "or a (real )?number", + r"Cannot cast array data from dtype\('O'\) to dtype\('float64'\) " + "according to the rule 'safe'", + "object cannot be converted to a FloatingDtype", + "'values' contains non-numeric NA", + ] + ) + + for null in tm.NP_NAT_OBJECTS + [NaT]: + with pytest.raises(TypeError, match=msg): + func([null, 1.0, 3.0], dtype=any_numeric_ea_dtype) + class TestSeriesConstructorIndexCoercion: def test_series_constructor_datetimelike_index_coercion(self):