From 524490550b0208561fb82597d2dab73cb153c865 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 11 Jan 2021 17:58:40 -0800 Subject: [PATCH 1/3] BUG: Series.__setitem__ with mismatched IntervalDtype --- pandas/core/arrays/datetimelike.py | 12 ++ pandas/core/arrays/interval.py | 16 ++- pandas/core/indexes/numeric.py | 4 + pandas/core/internals/blocks.py | 45 +++--- pandas/tests/arrays/interval/test_interval.py | 25 ++++ pandas/tests/arrays/test_datetimelike.py | 37 ++++- pandas/tests/internals/test_internals.py | 136 +++++++++++++++++- 7 files changed, 244 insertions(+), 31 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index b31bc0934fe60..ef93e80d83d04 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -613,6 +613,18 @@ def _validate_listlike(self, value, allow_object: bool = False): # We treat empty list as our own dtype. return type(self)._from_sequence([], dtype=self.dtype) + if hasattr(value, "dtype") and value.dtype == object: + # `array` below won't do inference if value is an Index or Series. + # so do so here. in the Index case, inferred_type may be cached. + if lib.infer_dtype(value) in self._infer_matches: + try: + value = type(self)._from_sequence(value) + except (ValueError, TypeError): + if allow_object: + return value + msg = self._validation_error_message(value, True) + raise TypeError(msg) + # Do type inference if necessary up front # e.g. we passed PeriodIndex.values and got an ndarray of Periods value = array(value) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 56550a8665816..ee3a521d68392 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -955,12 +955,22 @@ def _validate_listlike(self, value): # list-like of intervals try: array = IntervalArray(value) - # TODO: self._check_closed_matches(array, name="value") + self._check_closed_matches(array, name="value") value_left, value_right = array.left, array.right except TypeError as err: # wrong type: not interval or NA msg = f"'value' should be an interval type, got {type(value)} instead." raise TypeError(msg) from err + + try: + self.left._validate_fill_value(value_left) + except (ValueError, TypeError) as err: + msg = ( + "'value' should be a compatible interval type, " + f"got {type(value)} instead." + ) + raise TypeError(msg) from err + return value_left, value_right def _validate_scalar(self, value): @@ -995,10 +1005,12 @@ def _validate_setitem_value(self, value): value = np.timedelta64("NaT") value_left, value_right = value, value - elif is_interval_dtype(value) or isinstance(value, Interval): + elif isinstance(value, Interval): # scalar interval self._check_closed_matches(value, name="value") value_left, value_right = value.left, value.right + self.left._validate_fill_value(value_left) + self.left._validate_fill_value(value_right) else: return self._validate_listlike(value) diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index e5da085249acf..26599bd6ab871 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -152,6 +152,10 @@ def _validate_fill_value(self, value): raise TypeError value = int(value) + elif hasattr(value, "dtype") and value.dtype.kind in ["m", "M"]: + # TODO: if we're checking arraylike here, do so systematically + raise TypeError + return value def _convert_tolerance(self, tolerance, target): diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 9eb4bdc5dbae3..b964a801ff1da 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1,6 +1,6 @@ import inspect import re -from typing import TYPE_CHECKING, Any, List, Optional, Type, Union, cast +from typing import TYPE_CHECKING, Any, Callable, List, Optional, Type, Union, cast import numpy as np @@ -28,7 +28,6 @@ infer_dtype_from_scalar, maybe_downcast_numeric, maybe_downcast_to_dtype, - maybe_infer_dtype_type, maybe_promote, maybe_upcast, soft_convert_objects, @@ -51,7 +50,7 @@ ) from pandas.core.dtypes.dtypes import CategoricalDtype, ExtensionDtype from pandas.core.dtypes.generic import ABCDataFrame, ABCIndex, ABCPandasArray, ABCSeries -from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna +from pandas.core.dtypes.missing import isna import pandas.core.algorithms as algos from pandas.core.array_algos.putmask import ( @@ -1879,7 +1878,24 @@ def _unstack(self, unstacker, fill_value, new_placement): return blocks, mask -class ObjectValuesExtensionBlock(ExtensionBlock): +class HybridMixin: + """ + Mixin for Blocks backed (maybe indirectly) by ExtensionArrays. + """ + + array_values: Callable + + def _can_hold_element(self, element: Any) -> bool: + values = self.array_values() + + try: + values._validate_setitem_value(element) + return True + except (ValueError, TypeError): + return False + + +class ObjectValuesExtensionBlock(HybridMixin, ExtensionBlock): """ Block providing backwards-compatibility for `.values`. @@ -1890,16 +1906,6 @@ class ObjectValuesExtensionBlock(ExtensionBlock): def external_values(self): return self.values.astype(object) - def _can_hold_element(self, element: Any) -> bool: - if is_valid_nat_for_dtype(element, self.dtype): - return True - if isinstance(element, list) and len(element) == 0: - return True - tipo = maybe_infer_dtype_type(element) - if tipo is not None: - return issubclass(tipo.type, self.dtype.type) - return isinstance(element, self.dtype.type) - class NumericBlock(Block): __slots__ = () @@ -1959,7 +1965,7 @@ class IntBlock(NumericBlock): _can_hold_na = False -class DatetimeLikeBlockMixin(Block): +class DatetimeLikeBlockMixin(HybridMixin, Block): """Mixin class for DatetimeBlock, DatetimeTZBlock, and TimedeltaBlock.""" _can_hold_na = True @@ -2042,15 +2048,6 @@ def where(self, other, cond, errors="raise", axis: int = 0) -> List["Block"]: nb = self.make_block_same_class(res_values) return [nb] - def _can_hold_element(self, element: Any) -> bool: - arr = self.array_values() - - try: - arr._validate_setitem_value(element) - return True - except (TypeError, ValueError): - return False - class DatetimeBlock(DatetimeLikeBlockMixin): __slots__ = () diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py index f999f79bc389b..425a0ca070c3c 100644 --- a/pandas/tests/arrays/interval/test_interval.py +++ b/pandas/tests/arrays/interval/test_interval.py @@ -123,6 +123,31 @@ def test_set_na(self, left_right_dtypes): tm.assert_extension_array_equal(result, expected) + def test_setitem_mismatched_closed(self): + arr = IntervalArray.from_breaks(range(4)) + orig = arr.copy() + other = arr.set_closed("both") + + msg = "'value.closed' is 'both', expected 'right'" + with pytest.raises(ValueError, match=msg): + arr[0] = other[0] + with pytest.raises(ValueError, match=msg): + arr[:1] = other[:1] + with pytest.raises(ValueError, match=msg): + arr[:0] = other[:0] + with pytest.raises(ValueError, match=msg): + arr[:] = other[::-1] + with pytest.raises(ValueError, match=msg): + arr[:] = list(other[::-1]) + with pytest.raises(ValueError, match=msg): + arr[:] = other[::-1].astype(object) + with pytest.raises(ValueError, match=msg): + arr[:] = other[::-1].astype("category") + + # empty list should be no-op + arr[:0] = [] + tm.assert_interval_array_equal(arr, orig) + def test_repr(): # GH 25022 diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 81bcff410a4d3..36a52e5802396 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -8,11 +8,9 @@ from pandas.compat.numpy import np_version_under1p18 import pandas as pd +from pandas import DatetimeIndex, Index, Period, PeriodIndex, TimedeltaIndex import pandas._testing as tm -from pandas.core.arrays import DatetimeArray, PeriodArray, TimedeltaArray -from pandas.core.indexes.datetimes import DatetimeIndex -from pandas.core.indexes.period import Period, PeriodIndex -from pandas.core.indexes.timedeltas import TimedeltaIndex +from pandas.core.arrays import DatetimeArray, PandasArray, PeriodArray, TimedeltaArray # TODO: more freq variants @@ -402,6 +400,37 @@ def test_setitem(self): expected[:2] = expected[-2:] tm.assert_numpy_array_equal(arr.asi8, expected) + @pytest.mark.parametrize( + "box", + [ + Index, + pd.Series, + np.array, + list, + PandasArray, + ], + ) + def test_setitem_object_dtype(self, box, arr1d): + + expected = arr1d.copy()[::-1] + if expected.dtype.kind in ["m", "M"]: + expected = expected._with_freq(None) + + vals = expected + if box is list: + vals = list(vals) + elif box is np.array: + # if we do np.array(x).astype(object) then dt64 and td64 cast to ints + vals = np.array(vals.astype(object)) + elif box is PandasArray: + vals = box(np.asarray(vals, dtype=object)) + else: + vals = box(vals).astype(object) + + arr1d[:] = vals + + tm.assert_equal(arr1d, expected) + def test_setitem_strs(self, arr1d, request): # Check that we parse strs in both scalar and listlike if isinstance(arr1d, DatetimeArray): diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 3c949ad694abd..5ca2935de548d 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -7,8 +7,20 @@ from pandas._libs.internals import BlockPlacement +from pandas.core.dtypes.common import is_scalar + import pandas as pd -from pandas import Categorical, DataFrame, DatetimeIndex, Index, Series +from pandas import ( + Categorical, + DataFrame, + DatetimeIndex, + Index, + IntervalIndex, + Series, + Timedelta, + Timestamp, + period_range, +) import pandas._testing as tm import pandas.core.algorithms as algos from pandas.core.arrays import DatetimeArray, SparseArray, TimedeltaArray @@ -1074,9 +1086,30 @@ def test_blockplacement_add_int_raises(self, val): class TestCanHoldElement: + @pytest.fixture( + params=[ + lambda x: x, + lambda x: x.to_series(), + lambda x: x._data, + lambda x: list(x), + lambda x: x.astype(object), + lambda x: np.asarray(x), + lambda x: x[0], + lambda x: x[:0], + ] + ) + def element(self, request): + """ + Functions that take an Index and return an element that should have + blk._can_hold_element(element) for a Block with this index's dtype. + """ + return request.param + def test_datetime_block_can_hold_element(self): block = create_block("datetime", [0]) + assert block._can_hold_element([]) + # We will check that block._can_hold_element iff arr.__setitem__ works arr = pd.array(block.values.ravel()) @@ -1101,6 +1134,107 @@ def test_datetime_block_can_hold_element(self): with pytest.raises(TypeError, match=msg): arr[0] = val + @pytest.mark.parametrize("dtype", [np.int64, np.uint64, np.float64]) + def test_interval_can_hold_element_emptylist(self, dtype, element): + arr = np.array([1, 3, 4], dtype=dtype) + ii = IntervalIndex.from_breaks(arr) + blk = make_block(ii._data, [1], ndim=2) + + assert blk._can_hold_element([]) + # TODO: check this holds for all blocks + + @pytest.mark.parametrize("dtype", [np.int64, np.uint64, np.float64]) + def test_interval_can_hold_element(self, dtype, element): + arr = np.array([1, 3, 4, 9], dtype=dtype) + ii = IntervalIndex.from_breaks(arr) + blk = make_block(ii._data, [1], ndim=2) + + elem = element(ii) + self.check_series_setitem(elem, ii, True) + assert blk._can_hold_element(elem) + + # Careful: to get the expected Series-inplace behavior we need + # `elem` to not have the same length as `arr` + ii2 = IntervalIndex.from_breaks(arr[:-1], closed="neither") + elem = element(ii2) + self.check_series_setitem(elem, ii, False) + assert not blk._can_hold_element(elem) + + ii3 = IntervalIndex.from_breaks([Timestamp(1), Timestamp(3), Timestamp(4)]) + elem = element(ii3) + self.check_series_setitem(elem, ii, False) + assert not blk._can_hold_element(elem) + + ii4 = IntervalIndex.from_breaks([Timedelta(1), Timedelta(3), Timedelta(4)]) + elem = element(ii4) + self.check_series_setitem(elem, ii, False) + assert not blk._can_hold_element(elem) + + def test_period_can_hold_element_emptylist(self): + pi = period_range("2016", periods=3, freq="A") + blk = make_block(pi._data, [1], ndim=2) + + assert blk._can_hold_element([]) + + def test_period_can_hold_element(self, element): + pi = period_range("2016", periods=3, freq="A") + + elem = element(pi) + self.check_series_setitem(elem, pi, True) + + # Careful: to get the expected Series-inplace behavior we need + # `elem` to not have the same length as `arr` + pi2 = pi.asfreq("D")[:-1] + elem = element(pi2) + self.check_series_setitem(elem, pi, False) + + dti = pi.to_timestamp("S")[:-1] + elem = element(dti) + self.check_series_setitem(elem, pi, False) + + def check_setting(self, elem, index: Index, inplace: bool): + self.check_series_setitem(elem, index, inplace) + self.check_frame_setitem(elem, index, inplace) + + def check_can_hold_element(self, obj, elem, inplace: bool): + blk = obj._mgr.blocks[0] + if inplace: + assert blk._can_hold_element(elem) + else: + assert not blk._can_hold_element(elem) + + def check_series_setitem(self, elem, index: Index, inplace: bool): + arr = index._data.copy() + ser = Series(arr) + + self.check_can_hold_element(ser, elem, inplace) + + if is_scalar(elem): + ser[0] = elem + else: + ser[: len(elem)] = elem + + if inplace: + assert ser.array is arr # i.e. setting was done inplace + else: + assert ser.dtype == object + + def check_frame_setitem(self, elem, index: Index, inplace: bool): + arr = index._data.copy() + df = DataFrame(arr) + + self.check_can_hold_element(df, elem, inplace) + + if is_scalar(elem): + df.iloc[0, 0] = elem + else: + df.iloc[: len(elem), 0] = elem + + if inplace: + assert df._mgr.blocks[0].values is arr # i.e. setting was done inplace + else: + assert df.dtypes[0] == object + class TestShouldStore: def test_should_store_categorical(self): From 4c79151714e0cc33d8760c297339e0dd50f9913b Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 13 Jan 2021 07:50:42 -0800 Subject: [PATCH 2/3] whatsnew --- doc/source/whatsnew/v1.3.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 170d3eeb1e252..80eb6a23ae1ad 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -254,6 +254,7 @@ Indexing - Bug in :meth:`DataFrame.iloc.__setitem__` and :meth:`DataFrame.loc.__setitem__` with mixed dtypes when setting with a dictionary value (:issue:`38335`) - Bug in :meth:`DataFrame.loc` dropping levels of :class:`MultiIndex` when :class:`DataFrame` used as input has only one row (:issue:`10521`) - Bug in setting ``timedelta64`` values into numeric :class:`Series` failing to cast to object dtype (:issue:`39086`) +- Bug in setting :class:`Interval` values into a :class:`Series` or :class:`DataFrame` with mismatched :class:`IntervalDtype` incorrectly casting the new values to the existing dtype (:issue:`39120`) Missing ^^^^^^^ From b8e0dcfa5bf6dcf77b6b23e41b4ba8fd4cbcca74 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 14 Jan 2021 16:29:18 -0800 Subject: [PATCH 3/3] mypy fixup --- pandas/tests/internals/test_internals.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 5ca2935de548d..f9ba05f7092d4 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -1231,7 +1231,11 @@ def check_frame_setitem(self, elem, index: Index, inplace: bool): df.iloc[: len(elem), 0] = elem if inplace: - assert df._mgr.blocks[0].values is arr # i.e. setting was done inplace + # assertion here implies setting was done inplace + + # error: Item "ArrayManager" of "Union[ArrayManager, BlockManager]" + # has no attribute "blocks" [union-attr] + assert df._mgr.blocks[0].values is arr # type:ignore[union-attr] else: assert df.dtypes[0] == object