From 0f128c4c1109161ad7c3a9ee9afa1d05fc7399f9 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 25 Jan 2022 15:38:52 -0800 Subject: [PATCH 1/7] compat for np lt 1.19 --- pandas/core/dtypes/cast.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index b1d7de0515998..6c27e2dbcc7a9 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -39,6 +39,7 @@ DtypeObj, Scalar, ) +from pandas.compat import np_version_under1p20 from pandas.errors import IntCastingNaNError from pandas.util._exceptions import find_stack_level from pandas.util._validators import validate_bool_kwarg From e75ff0004779cece149905c9a7f735deb5772147 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 28 Jan 2022 12:52:07 -0800 Subject: [PATCH 2/7] use array_equivalent --- pandas/core/dtypes/cast.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 6c27e2dbcc7a9..b1d7de0515998 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -39,7 +39,6 @@ DtypeObj, Scalar, ) -from pandas.compat import np_version_under1p20 from pandas.errors import IntCastingNaNError from pandas.util._exceptions import find_stack_level from pandas.util._validators import validate_bool_kwarg From 80bba4cb83a177a0f72de3807852bc15103b5c8d Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 5 Feb 2022 18:12:49 -0800 Subject: [PATCH 3/7] BUG: Series[float32].__setitem__(int_cant_hold_in_int32) not coercing --- pandas/core/dtypes/cast.py | 11 ++- pandas/core/internals/blocks.py | 91 +++++++++----------- pandas/tests/indexing/test_coercion.py | 27 +++++- pandas/tests/series/indexing/test_setitem.py | 37 ++++++++ pandas/tests/series/methods/test_fillna.py | 19 +++- 5 files changed, 128 insertions(+), 57 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index b1d7de0515998..96e944ec3d7f4 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -2017,6 +2017,13 @@ def np_can_hold_element(dtype: np.dtype, element: Any) -> Any: raise LossySetitemError elif dtype.kind == "f": + if lib.is_integer(element) or lib.is_float(element): + casted = dtype.type(element) + if np.isnan(casted) or casted == element: + return casted + # otherwise e.g. overflow see TestCoercionFloat32 + raise LossySetitemError + if tipo is not None: # TODO: itemsize check? if tipo.kind not in ["f", "i", "u"]: @@ -2028,7 +2035,7 @@ def np_can_hold_element(dtype: np.dtype, element: Any) -> Any: if element._hasna: raise LossySetitemError return element - elif tipo.itemsize > dtype.itemsize: + elif tipo.itemsize > dtype.itemsize or tipo.kind != dtype.kind: if isinstance(element, np.ndarray): # e.g. TestDataFrameIndexingWhere::test_where_alignment casted = element.astype(dtype) @@ -2039,8 +2046,6 @@ def np_can_hold_element(dtype: np.dtype, element: Any) -> Any: return element - if lib.is_integer(element) or lib.is_float(element): - return element raise LossySetitemError elif dtype.kind == "c": diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 5eb59f2eb9a67..47a7dbef965a9 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1019,7 +1019,7 @@ def putmask(self, mask, new) -> list[Block]: res_blocks.extend(rbs) return res_blocks - def where(self, other, cond) -> list[Block]: + def where(self, other, cond, _downcast="infer") -> list[Block]: """ evaluate the block; return result block(s) from the result @@ -1027,6 +1027,8 @@ def where(self, other, cond) -> list[Block]: ---------- other : a ndarray/object cond : np.ndarray[bool], SparseArray[bool], or BooleanArray + _downcast : str or None, default "infer" + Private because we only specify it when calling from fillna. Returns ------- @@ -1066,7 +1068,7 @@ def where(self, other, cond) -> list[Block]: block = self.coerce_to_target_dtype(other) blocks = block.where(orig_other, cond) - return self._maybe_downcast(blocks, "infer") + return self._maybe_downcast(blocks, downcast=_downcast) else: # since _maybe_downcast would split blocks anyway, we @@ -1083,7 +1085,7 @@ def where(self, other, cond) -> list[Block]: oth = other[:, i : i + 1] submask = cond[:, i : i + 1] - rbs = nb.where(oth, submask) + rbs = nb.where(oth, submask, _downcast=_downcast) res_blocks.extend(rbs) return res_blocks @@ -1158,21 +1160,19 @@ def fillna( if limit is not None: mask[mask.cumsum(self.ndim - 1) > limit] = False - if self._can_hold_element(value): - nb = self if inplace else self.copy() - putmask_inplace(nb.values, mask, value) - return nb._maybe_downcast([nb], downcast) - - elif self.ndim == 1 or self.shape[0] == 1: - blk = self.coerce_to_target_dtype(value) - # bc we have already cast, inplace=True may avoid an extra copy - return blk.fillna(value, limit=limit, inplace=True, downcast=None) - + if inplace: + nbs = self.putmask(mask.T, value) else: - # operate column-by-column - return self.split_and_operate( - type(self).fillna, value, limit=limit, inplace=inplace, downcast=None - ) + # without _downcast, we would break + # test_fillna_dtype_conversion_equiv_replace + nbs = self.where(value, ~mask.T, _downcast=False) + + # Note: blk._maybe_downcast vs self._maybe_downcast(nbs) + # makes a difference bc blk may have object dtype, which has + # different behavior in _maybe_downcast. + return extend_blocks( + [blk._maybe_downcast([blk], downcast=downcast) for blk in nbs] + ) def interpolate( self, @@ -1401,7 +1401,8 @@ def setitem(self, indexer, value): else: return self - def where(self, other, cond) -> list[Block]: + def where(self, other, cond, _downcast="infer") -> list[Block]: + # _downcast private bc we only specify it when calling from fillna arr = self.values.T cond = extract_bool_array(cond) @@ -1436,7 +1437,7 @@ def where(self, other, cond) -> list[Block]: # isinstance(values, NDArrayBackedExtensionArray) blk = self.coerce_to_target_dtype(orig_other) nbs = blk.where(orig_other, orig_cond) - return self._maybe_downcast(nbs, "infer") + return self._maybe_downcast(nbs, downcast=_downcast) else: raise @@ -1485,39 +1486,29 @@ def fillna( ) -> list[Block]: # Caller is responsible for validating limit; if int it is strictly positive - try: - new_values = self.values.fillna(value=value, limit=limit) - except (TypeError, ValueError) as err: - _catch_deprecated_value_error(err) - - if is_interval_dtype(self.dtype): - # Discussion about what we want to support in the general - # case GH#39584 - blk = self.coerce_to_target_dtype(value) - return blk.fillna(value, limit, inplace, downcast) - - elif isinstance(self, NDArrayBackedExtensionBlock): - # We support filling a DatetimeTZ with a `value` whose timezone - # is different by coercing to object. - if self.dtype.kind == "m": - # GH#45746 - warnings.warn( - "The behavior of fillna with timedelta64[ns] dtype and " - f"an incompatible value ({type(value)}) is deprecated. " - "In a future version, this will cast to a common dtype " - "(usually object) instead of raising, matching the " - "behavior of other dtypes.", - FutureWarning, - stacklevel=find_stack_level(), - ) - raise - blk = self.coerce_to_target_dtype(value) - return blk.fillna(value, limit, inplace, downcast) - - else: + if self.dtype.kind == "m": + try: + res_values = self.values.fillna(value, limit) + except (ValueError, TypeError): + # GH#45746 + warnings.warn( + "The behavior of fillna with timedelta64[ns] dtype and " + f"an incompatible value ({type(value)}) is deprecated. " + "In a future version, this will cast to a common dtype " + "(usually object) instead of raising, matching the " + "behavior of other dtypes.", + FutureWarning, + stacklevel=find_stack_level(), + ) raise + else: + res_blk = self.make_block(res_values) + return [res_blk] - return [self.make_block_same_class(values=new_values)] + # TODO: since this now dispatches to super, which in turn dispatches + # to putmask, it may *actually* respect 'inplace=True'. If so, add + # tests for this. + return super().fillna(value, limit=limit, inplace=inplace, downcast=downcast) def delete(self, loc) -> Block: # This will be unnecessary if/when __array_function__ is implemented diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index ab9eef218c0da..2c2aaf87aa62d 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -753,9 +753,30 @@ def test_fillna_index_bool(self): def test_fillna_series_timedelta64(self): raise NotImplementedError - @pytest.mark.xfail(reason="Test not implemented") - def test_fillna_series_period(self): - raise NotImplementedError + @pytest.mark.parametrize( + "fill_val", + [ + 1, + 1.1, + 1 + 1j, + True, + pd.Interval(1, 2, closed="left"), + pd.Timestamp("2012-01-01", tz="US/Eastern"), + pd.Timestamp("2012-01-01"), + pd.Timedelta(days=1), + pd.Period("2016-01-01", "W"), + ], + ) + def test_fillna_series_period(self, index_or_series, fill_val): + + pi = pd.period_range("2016-01-01", periods=4, freq="D").insert(1, pd.NaT) + assert isinstance(pi.dtype, pd.PeriodDtype) + obj = index_or_series(pi) + + exp = index_or_series([pi[0], fill_val, pi[2], pi[3], pi[4]], dtype=object) + + fill_dtype = object + self._assert_fillna_conversion(obj, fill_val, exp, fill_dtype) @pytest.mark.xfail(reason="Test not implemented") def test_fillna_index_timedelta64(self): diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 2261dd18baa3e..5322820eb9917 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -1260,6 +1260,43 @@ def obj(self): return Series([1.1, 2.2, 3.3, 4.4]) +@pytest.mark.parametrize( + "val,exp_dtype", + [ + (1, np.float32), + pytest.param( + 1.1, + np.float32, + marks=pytest.mark.xfail( + reason="np.float32(1.1) ends up as 1.100000023841858, so " + "np_can_hold_element raises and we cast to float64", + ), + ), + (1 + 1j, np.complex128), + (True, object), + (np.uint8(2), np.float32), + (np.uint32(2), np.float32), + # float32 cannot hold 2**33-1 exactly + # (closest it can hold is 4294967300.0 which off by 5.0), so + # we cast to float64 + (np.uint32(2**33 - 1), np.float64), + (np.uint64(2), np.float32), + (np.int64(2), np.float32), + ], +) +class TestCoercionFloat32(CoercionTest): + @pytest.fixture + def obj(self): + return Series([1.1, 2.2, 3.3, 4.4], dtype=np.float32) + + def test_slice_key(self, obj, key, expected, val, indexer_sli, is_inplace): + super().test_slice_key(obj, key, expected, val, indexer_sli, is_inplace) + + if type(val) is float: + # the xfail would xpass bc test_slice_key short-circuits + raise AssertionError("xfail not relevant for this test.") + + @pytest.mark.parametrize( "val,exp_dtype", [(Timestamp("2012-01-01"), "datetime64[ns]"), (1, object), ("x", object)], diff --git a/pandas/tests/series/methods/test_fillna.py b/pandas/tests/series/methods/test_fillna.py index f339497f10029..71a1b01eb9157 100644 --- a/pandas/tests/series/methods/test_fillna.py +++ b/pandas/tests/series/methods/test_fillna.py @@ -749,14 +749,31 @@ def test_fillna_categorical_raises(self): @pytest.mark.parametrize("dtype", [float, "float32", "float64"]) @pytest.mark.parametrize("fill_type", tm.ALL_REAL_NUMPY_DTYPES) - def test_fillna_float_casting(self, dtype, fill_type): + @pytest.mark.parametrize("scalar", [True, False]) + def test_fillna_float_casting(self, dtype, fill_type, scalar): # GH-43424 ser = Series([np.nan, 1.2], dtype=dtype) fill_values = Series([2, 2], dtype=fill_type) + if scalar: + fill_values = fill_values.dtype.type(2) + result = ser.fillna(fill_values) expected = Series([2.0, 1.2], dtype=dtype) tm.assert_series_equal(result, expected) + ser = Series([np.nan, 1.2], dtype=dtype) + mask = ser.isna().to_numpy() + ser[mask] = fill_values + tm.assert_series_equal(ser, expected) + + ser = Series([np.nan, 1.2], dtype=dtype) + ser.mask(mask, fill_values, inplace=True) + tm.assert_series_equal(ser, expected) + + ser = Series([np.nan, 1.2], dtype=dtype) + res = ser.where(~mask, fill_values) + tm.assert_series_equal(res, expected) + def test_fillna_f32_upcast_with_dict(self): # GH-43424 ser = Series([np.nan, 1.2], dtype=np.float32) From 1a0d33029985d7ef09c9055a0d9621b0f305d66a Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 5 Feb 2022 19:36:11 -0800 Subject: [PATCH 4/7] revert --- pandas/core/internals/blocks.py | 91 ++++++++++++++++++--------------- 1 file changed, 50 insertions(+), 41 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 47a7dbef965a9..5eb59f2eb9a67 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1019,7 +1019,7 @@ def putmask(self, mask, new) -> list[Block]: res_blocks.extend(rbs) return res_blocks - def where(self, other, cond, _downcast="infer") -> list[Block]: + def where(self, other, cond) -> list[Block]: """ evaluate the block; return result block(s) from the result @@ -1027,8 +1027,6 @@ def where(self, other, cond, _downcast="infer") -> list[Block]: ---------- other : a ndarray/object cond : np.ndarray[bool], SparseArray[bool], or BooleanArray - _downcast : str or None, default "infer" - Private because we only specify it when calling from fillna. Returns ------- @@ -1068,7 +1066,7 @@ def where(self, other, cond, _downcast="infer") -> list[Block]: block = self.coerce_to_target_dtype(other) blocks = block.where(orig_other, cond) - return self._maybe_downcast(blocks, downcast=_downcast) + return self._maybe_downcast(blocks, "infer") else: # since _maybe_downcast would split blocks anyway, we @@ -1085,7 +1083,7 @@ def where(self, other, cond, _downcast="infer") -> list[Block]: oth = other[:, i : i + 1] submask = cond[:, i : i + 1] - rbs = nb.where(oth, submask, _downcast=_downcast) + rbs = nb.where(oth, submask) res_blocks.extend(rbs) return res_blocks @@ -1160,19 +1158,21 @@ def fillna( if limit is not None: mask[mask.cumsum(self.ndim - 1) > limit] = False - if inplace: - nbs = self.putmask(mask.T, value) + if self._can_hold_element(value): + nb = self if inplace else self.copy() + putmask_inplace(nb.values, mask, value) + return nb._maybe_downcast([nb], downcast) + + elif self.ndim == 1 or self.shape[0] == 1: + blk = self.coerce_to_target_dtype(value) + # bc we have already cast, inplace=True may avoid an extra copy + return blk.fillna(value, limit=limit, inplace=True, downcast=None) + else: - # without _downcast, we would break - # test_fillna_dtype_conversion_equiv_replace - nbs = self.where(value, ~mask.T, _downcast=False) - - # Note: blk._maybe_downcast vs self._maybe_downcast(nbs) - # makes a difference bc blk may have object dtype, which has - # different behavior in _maybe_downcast. - return extend_blocks( - [blk._maybe_downcast([blk], downcast=downcast) for blk in nbs] - ) + # operate column-by-column + return self.split_and_operate( + type(self).fillna, value, limit=limit, inplace=inplace, downcast=None + ) def interpolate( self, @@ -1401,8 +1401,7 @@ def setitem(self, indexer, value): else: return self - def where(self, other, cond, _downcast="infer") -> list[Block]: - # _downcast private bc we only specify it when calling from fillna + def where(self, other, cond) -> list[Block]: arr = self.values.T cond = extract_bool_array(cond) @@ -1437,7 +1436,7 @@ def where(self, other, cond, _downcast="infer") -> list[Block]: # isinstance(values, NDArrayBackedExtensionArray) blk = self.coerce_to_target_dtype(orig_other) nbs = blk.where(orig_other, orig_cond) - return self._maybe_downcast(nbs, downcast=_downcast) + return self._maybe_downcast(nbs, "infer") else: raise @@ -1486,29 +1485,39 @@ def fillna( ) -> list[Block]: # Caller is responsible for validating limit; if int it is strictly positive - if self.dtype.kind == "m": - try: - res_values = self.values.fillna(value, limit) - except (ValueError, TypeError): - # GH#45746 - warnings.warn( - "The behavior of fillna with timedelta64[ns] dtype and " - f"an incompatible value ({type(value)}) is deprecated. " - "In a future version, this will cast to a common dtype " - "(usually object) instead of raising, matching the " - "behavior of other dtypes.", - FutureWarning, - stacklevel=find_stack_level(), - ) - raise + try: + new_values = self.values.fillna(value=value, limit=limit) + except (TypeError, ValueError) as err: + _catch_deprecated_value_error(err) + + if is_interval_dtype(self.dtype): + # Discussion about what we want to support in the general + # case GH#39584 + blk = self.coerce_to_target_dtype(value) + return blk.fillna(value, limit, inplace, downcast) + + elif isinstance(self, NDArrayBackedExtensionBlock): + # We support filling a DatetimeTZ with a `value` whose timezone + # is different by coercing to object. + if self.dtype.kind == "m": + # GH#45746 + warnings.warn( + "The behavior of fillna with timedelta64[ns] dtype and " + f"an incompatible value ({type(value)}) is deprecated. " + "In a future version, this will cast to a common dtype " + "(usually object) instead of raising, matching the " + "behavior of other dtypes.", + FutureWarning, + stacklevel=find_stack_level(), + ) + raise + blk = self.coerce_to_target_dtype(value) + return blk.fillna(value, limit, inplace, downcast) + else: - res_blk = self.make_block(res_values) - return [res_blk] + raise - # TODO: since this now dispatches to super, which in turn dispatches - # to putmask, it may *actually* respect 'inplace=True'. If so, add - # tests for this. - return super().fillna(value, limit=limit, inplace=inplace, downcast=downcast) + return [self.make_block_same_class(values=new_values)] def delete(self, loc) -> Block: # This will be unnecessary if/when __array_function__ is implemented From 8912ca34759e85815e115d5eb56fff2fa1cbc545 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 5 Feb 2022 19:38:25 -0800 Subject: [PATCH 5/7] fix uint32max --- pandas/tests/series/indexing/test_setitem.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 5322820eb9917..48c73e1941ec1 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -1279,7 +1279,7 @@ def obj(self): # float32 cannot hold 2**33-1 exactly # (closest it can hold is 4294967300.0 which off by 5.0), so # we cast to float64 - (np.uint32(2**33 - 1), np.float64), + (np.uint32(np.iinfo(np.uint32).max), np.float64), (np.uint64(2), np.float32), (np.int64(2), np.float32), ], From 474d9c20d0d7f957ace9ddd2830512ed8f1bf2c7 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 5 Feb 2022 19:52:04 -0800 Subject: [PATCH 6/7] typo fixup --- pandas/tests/series/indexing/test_setitem.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 48c73e1941ec1..c270584342491 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -1276,7 +1276,7 @@ def obj(self): (True, object), (np.uint8(2), np.float32), (np.uint32(2), np.float32), - # float32 cannot hold 2**33-1 exactly + # float32 cannot hold np.iinfo(np.uint32).max exactly # (closest it can hold is 4294967300.0 which off by 5.0), so # we cast to float64 (np.uint32(np.iinfo(np.uint32).max), np.float64), From 1b8ec10632b4d8f4bdaf6cc48d090138fa4f0990 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 6 Feb 2022 15:00:25 -0800 Subject: [PATCH 7/7] whatsnew --- doc/source/whatsnew/v1.5.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index b559d8eb463a1..ae1d2244663e3 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -318,6 +318,7 @@ Indexing - Bug in getting a column from a DataFrame with an object-dtype row index with datetime-like values: the resulting Series now preserves the exact object-dtype Index from the parent DataFrame (:issue:`42950`) - Bug in indexing on a :class:`DatetimeIndex` with a ``np.str_`` key incorrectly raising (:issue:`45580`) - Bug in :meth:`CategoricalIndex.get_indexer` when index contains ``NaN`` values, resulting in elements that are in target but not present in the index to be mapped to the index of the NaN element, instead of -1 (:issue:`45361`) +- Bug in setting large integer values into :class:`Series` with ``float32`` or ``float16`` dtype incorrectly altering these values instead of coercing to ``float64`` dtype (:issue:`45844`) - Missing