Skip to content

BUG: IntegerArray/FloatingArray constructors mismatched NAs #44514

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Dec 1, 2021
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -616,6 +616,8 @@ Indexing
- Bug in :meth:`Series.__setitem__` with a boolean mask indexer setting a listlike value of length 1 incorrectly broadcasting that value (:issue:`44265`)
- Bug in :meth:`DataFrame.loc.__setitem__` and :meth:`DataFrame.iloc.__setitem__` with mixed dtypes sometimes failing to operate in-place (:issue:`44345`)
- Bug in :meth:`DataFrame.loc.__getitem__` incorrectly raising ``KeyError`` when selecting a single column with a boolean key (:issue:`44322`).
- Bug in setting :meth:`DataFrame.iloc` with a single ``ExtensionDtype`` column and setting 2D values e.g. ``df.iloc[:] = df.values`` incorrectly raising (:issue:`44514`)
-

Missing
^^^^^^^
Expand Down Expand Up @@ -717,6 +719,7 @@ ExtensionArray
- Bug in :func:`array` failing to preserve :class:`PandasArray` (:issue:`43887`)
- NumPy ufuncs ``np.abs``, ``np.positive``, ``np.negative`` now correctly preserve dtype when called on ExtensionArrays that implement ``__abs__, __pos__, __neg__``, respectively. In particular this is fixed for :class:`TimedeltaArray` (:issue:`43899`)
- Avoid raising ``PerformanceWarning`` about fragmented DataFrame when using many columns with an extension dtype (:issue:`44098`)
- Bug in :class:`IntegerArray` and :class:`FloatingArray` construction incorrectly coercing mismatched NA values (e.g. ``np.timedelta64("NaT")``) to numeric NA (:issue:`44514`)
- Bug in :meth:`BooleanArray.__eq__` and :meth:`BooleanArray.__ne__` raising ``TypeError`` on comparison with an incompatible type (like a string). This caused :meth:`DataFrame.replace` to sometimes raise a ``TypeError`` if a nullable boolean column was included (:issue:`44499`)
-

Expand Down
28 changes: 28 additions & 0 deletions pandas/_libs/missing.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,34 @@ cdef bint checknull_with_nat_and_na(object obj):
return checknull_with_nat(obj) or obj is C_NA


@cython.wraparound(False)
@cython.boundscheck(False)
def is_numeric_na(values: ndarray) -> ndarray:
"""
Check for NA values consistent with IntegerArray/FloatingArray.

Similar to a vectorized is_valid_na_for_dtype restricted to numeric dtypes.

Returns
-------
ndarray[bool]
"""
cdef:
ndarray[uint8_t] result
Py_ssize_t i, N
object val

N = len(values)
result = np.zeros(N, dtype=np.uint8)

for i in range(N):
val = values[i]
if val is None or val is C_NA or util.is_nan(val) or is_decimal_na(val):
result[i] = True

return result.view(bool)


# -----------------------------------------------------------------------------
# Implementation of NA singleton

Expand Down
15 changes: 10 additions & 5 deletions pandas/core/arrays/floating.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,8 +135,7 @@ def coerce_to_array(
if is_object_dtype(values):
inferred_type = lib.infer_dtype(values, skipna=True)
if inferred_type == "empty":
values = np.empty(len(values))
values.fill(np.nan)
pass
elif inferred_type not in [
"floating",
"integer",
Expand All @@ -152,13 +151,19 @@ def coerce_to_array(
elif not (is_integer_dtype(values) or is_float_dtype(values)):
raise TypeError(f"{values.dtype} cannot be converted to a FloatingDtype")

if values.ndim != 1:
raise TypeError("values must be a 1D list-like")

if mask is None:
mask = isna(values)
mask = libmissing.is_numeric_na(values)
mask2 = isna(values)
if not (mask == mask2).all():
# e.g. if we have a timedelta64("NaT")
raise TypeError(f"{values.dtype} cannot be converted to a FloatingDtype")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Alternatively, could it be libmissing.is_numeric_na that already raises on encountering a "non-numeric NA"? (is there a use case for is_numeric_na to not be strict about this, i.e. to get a "partial" mask?)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sure

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Were you planning to address this one?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

planning to address in a follow-up


else:
assert len(mask) == len(values)

if not values.ndim == 1:
raise TypeError("values must be a 1D list-like")
if not mask.ndim == 1:
raise TypeError("mask must be a 1D list-like")

Expand Down
11 changes: 5 additions & 6 deletions pandas/core/arrays/integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@
is_string_dtype,
pandas_dtype,
)
from pandas.core.dtypes.missing import isna

from pandas.core.arrays import ExtensionArray
from pandas.core.arrays.masked import (
Expand Down Expand Up @@ -190,8 +189,7 @@ def coerce_to_array(
if is_object_dtype(values) or is_string_dtype(values):
inferred_type = lib.infer_dtype(values, skipna=True)
if inferred_type == "empty":
values = np.empty(len(values))
values.fill(np.nan)
pass
elif inferred_type not in [
"floating",
"integer",
Expand All @@ -209,13 +207,14 @@ def coerce_to_array(
elif not (is_integer_dtype(values) or is_float_dtype(values)):
raise TypeError(f"{values.dtype} cannot be converted to an IntegerDtype")

if values.ndim != 1:
raise TypeError("values must be a 1D list-like")

if mask is None:
mask = isna(values)
mask = libmissing.is_numeric_na(values)
else:
assert len(mask) == len(values)

if values.ndim != 1:
raise TypeError("values must be a 1D list-like")
if mask.ndim != 1:
raise TypeError("mask must be a 1D list-like")

Expand Down
11 changes: 11 additions & 0 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1508,6 +1508,17 @@ def setitem(self, indexer, value):
# we are always 1-D
indexer = indexer[0]

# TODO(EA2D): not needed with 2D EAS
if isinstance(value, (np.ndarray, ExtensionArray)) and value.ndim == 2:
assert value.shape[1] == 1
# error: No overload variant of "__getitem__" of "ExtensionArray"
# matches argument type "Tuple[slice, int]"
value = value[:, 0] # type: ignore[call-overload]
elif isinstance(value, ABCDataFrame):
# TODO: should we avoid getting here with DataFrame?
assert value.shape[1] == 1
value = value._ixs(0, axis=1)._values

check_setitem_lengths(indexer, value, self.values)
self.values[indexer] = value
return self
Expand Down
12 changes: 8 additions & 4 deletions pandas/tests/arrays/floating/test_construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,14 +97,18 @@ def test_to_array_mixed_integer_float():
np.array(["foo"]),
[[1, 2], [3, 4]],
[np.nan, {"a": 1}],
# GH#44514 all-NA case used to get quietly swapped out before checking ndim
np.array([pd.NA] * 6, dtype=object).reshape(3, 2),
],
)
def test_to_array_error(values):
# error in converting existing arrays to FloatingArray
msg = (
r"(:?.* cannot be converted to a FloatingDtype)"
r"|(:?values must be a 1D list-like)"
r"|(:?Cannot pass scalar)"
msg = "|".join(
[
"cannot be converted to a FloatingDtype",
"values must be a 1D list-like",
"Cannot pass scalar",
]
)
with pytest.raises((TypeError, ValueError), match=msg):
pd.array(values, dtype="Float64")
Expand Down
32 changes: 32 additions & 0 deletions pandas/tests/extension/base/setitem.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
import numpy as np
import pytest

from pandas.core.dtypes.dtypes import (
DatetimeTZDtype,
IntervalDtype,
PandasDtype,
PeriodDtype,
)

import pandas as pd
import pandas._testing as tm
from pandas.tests.extension.base.base import BaseExtensionTests
Expand Down Expand Up @@ -357,6 +364,31 @@ def test_setitem_series(self, data, full_indexer):
)
self.assert_series_equal(result, expected)

def test_setitem_frame_2d_values(self, data, using_array_manager, request):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you move this test out of the extension base tests, or remove the need to use using_array_manager? (this is not defined by external users of those tests, and would be a bit annoying to replicate)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sure

# GH#44514
if using_array_manager:
if not isinstance(
data.dtype, (PandasDtype, PeriodDtype, IntervalDtype, DatetimeTZDtype)
):
# These dtypes have non-broken implementations of _can_hold_element
mark = pytest.mark.xfail(reason="Goes through split path, loses dtype")
request.node.add_marker(mark)

df = pd.DataFrame({"A": data})
orig = df.copy()

df.iloc[:] = df
self.assert_frame_equal(df, orig)

df.iloc[:-1] = df.iloc[:-1]
self.assert_frame_equal(df, orig)

df.iloc[:] = df.values
self.assert_frame_equal(df, orig)

df.iloc[:-1] = df.values[:-1]
self.assert_frame_equal(df, orig)

def test_delitem_series(self, data):
# GH#40763
ser = pd.Series(data, name="data")
Expand Down
70 changes: 70 additions & 0 deletions pandas/tests/frame/indexing/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1217,6 +1217,76 @@ def test_setitem_array_as_cell_value(self):
expected = DataFrame({"a": [np.zeros((2,))], "b": [np.zeros((2, 2))]})
tm.assert_frame_equal(df, expected)

def test_iloc_setitem_nullable_2d_values(self, using_array_manager, request):
if using_array_manager:
mark = pytest.mark.xfail(reason="Goes through split path, loses dtype")
request.node.add_marker(mark)

df = DataFrame({"A": [1, 2, 3]}, dtype="Int64")
orig = df.copy()

df.loc[:] = df.values[:, ::-1]
tm.assert_frame_equal(df, orig)

df.loc[:] = pd.core.arrays.PandasArray(df.values[:, ::-1])
tm.assert_frame_equal(df, orig)

df.iloc[:] = df.iloc[:, :]
tm.assert_frame_equal(df, orig)

@pytest.mark.parametrize(
"null", [pd.NaT, pd.NaT.to_numpy("M8[ns]"), pd.NaT.to_numpy("m8[ns]")]
)
def test_setting_mismatched_na_into_nullable_fails(
self, null, any_numeric_ea_dtype
):
# GH#44514 don't cast mismatched nulls to pd.NA
df = DataFrame({"A": [1, 2, 3]}, dtype=any_numeric_ea_dtype)
ser = df["A"]
arr = ser._values

msg = "|".join(
[
r"int\(\) argument must be a string, a bytes-like object or a "
"(real )?number, not 'NaTType'",
r"timedelta64\[ns\] cannot be converted to an? (Floating|Integer)Dtype",
r"datetime64\[ns\] cannot be converted to an? (Floating|Integer)Dtype",
"object cannot be converted to a FloatingDtype",
]
)
with pytest.raises(TypeError, match=msg):
arr[0] = null

with pytest.raises(TypeError, match=msg):
arr[:2] = [null, null]

with pytest.raises(TypeError, match=msg):
ser[0] = null

with pytest.raises(TypeError, match=msg):
ser[:2] = [null, null]

with pytest.raises(TypeError, match=msg):
ser.iloc[0] = null

with pytest.raises(TypeError, match=msg):
ser.iloc[:2] = [null, null]

with pytest.raises(TypeError, match=msg):
df.iloc[0, 0] = null

with pytest.raises(TypeError, match=msg):
df.iloc[:2, 0] = [null, null]

# Multi-Block
df2 = df.copy()
df2["B"] = ser.copy()
with pytest.raises(TypeError, match=msg):
df2.iloc[0, 0] = null

with pytest.raises(TypeError, match=msg):
df2.iloc[:2, 0] = [null, null]


class TestDataFrameIndexingUInt64:
def test_setitem(self, uint64_frame):
Expand Down
11 changes: 8 additions & 3 deletions pandas/tests/series/methods/test_clip.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,14 @@ def test_series_clipping_with_na_values(self, any_numeric_ea_dtype, nulls_fixtur
# Ensure that clipping method can handle NA values with out failing
# GH#40581

s = Series([nulls_fixture, 1.0, 3.0], dtype=any_numeric_ea_dtype)
s_clipped_upper = s.clip(upper=2.0)
s_clipped_lower = s.clip(lower=2.0)
if nulls_fixture is pd.NaT:
# constructor will raise, see
# test_constructor_mismatched_null_nullable_dtype
return

ser = Series([nulls_fixture, 1.0, 3.0], dtype=any_numeric_ea_dtype)
s_clipped_upper = ser.clip(upper=2.0)
s_clipped_lower = ser.clip(lower=2.0)

expected_upper = Series([nulls_fixture, 1.0, 2.0], dtype=any_numeric_ea_dtype)
expected_lower = Series([nulls_fixture, 2.0, 3.0], dtype=any_numeric_ea_dtype)
Expand Down
31 changes: 30 additions & 1 deletion pandas/tests/series/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,10 @@
iNaT,
lib,
)
from pandas.compat.numpy import np_version_under1p19
from pandas.compat.numpy import (
np_version_under1p19,
np_version_under1p20,
)
import pandas.util._test_decorators as td

from pandas.core.dtypes.common import (
Expand Down Expand Up @@ -1817,6 +1820,32 @@ def test_constructor_bool_dtype_missing_values(self):
expected = Series(True, index=[0], dtype="bool")
tm.assert_series_equal(result, expected)

@pytest.mark.filterwarnings(
"ignore:elementwise comparison failed:DeprecationWarning"
)
@pytest.mark.xfail(
np_version_under1p20, reason="np.array([td64nat, float, float]) raises"
)
@pytest.mark.parametrize("func", [Series, DataFrame, Index, pd.array])
def test_constructor_mismatched_null_nullable_dtype(
self, func, any_numeric_ea_dtype
):
# GH#44514
msg = "|".join(
[
"cannot safely cast non-equivalent object",
r"int\(\) argument must be a string, a bytes-like object "
"or a (real )?number",
r"Cannot cast array data from dtype\('O'\) to dtype\('float64'\) "
"according to the rule 'safe'",
"object cannot be converted to a FloatingDtype",
]
)

for null in tm.NP_NAT_OBJECTS + [NaT]:
with pytest.raises(TypeError, match=msg):
func([null, 1.0, 3.0], dtype=any_numeric_ea_dtype)


class TestSeriesConstructorIndexCoercion:
def test_series_constructor_datetimelike_index_coercion(self):
Expand Down