From 66d70a5795097930bd78a658fc7c04dd442ea239 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 23 Jan 2022 20:29:31 -0800 Subject: [PATCH 1/5] REF: avoid upcast/downcast in Block.where --- pandas/core/dtypes/cast.py | 10 ++++- pandas/core/internals/blocks.py | 45 +++++++---------------- pandas/tests/frame/indexing/test_where.py | 27 ++++++-------- 3 files changed, 34 insertions(+), 48 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 92f3cfdc589ff..0cfe1fda88767 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1961,7 +1961,7 @@ def np_can_hold_element(dtype: np.dtype, element: Any) -> Any: # in smaller int dtypes. info = np.iinfo(dtype) if info.min <= element <= info.max: - return element + return dtype.type(element) raise ValueError if tipo is not None: @@ -2017,6 +2017,14 @@ def np_can_hold_element(dtype: np.dtype, element: Any) -> Any: if element._hasna: raise ValueError return element + elif tipo.itemsize > dtype.itemsize: + if isinstance(element, np.ndarray): + # e.g. TestDataFrameIndexingWhere::test_where_alignment + casted = element.astype(dtype) + if np.array_equal(casted, element, equal_nan=True): + return casted + raise ValueError + return element if lib.is_integer(element) or lib.is_float(element): diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 3d4f53530b89c..c32e671dcc50e 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -38,8 +38,8 @@ from pandas.core.dtypes.cast import ( can_hold_element, find_result_type, - maybe_downcast_numeric, maybe_downcast_to_dtype, + np_can_hold_element, soft_convert_objects, ) from pandas.core.dtypes.common import ( @@ -1190,13 +1190,19 @@ def where(self, other, cond) -> list[Block]: other = self._standardize_fill_value(other) - if not self._can_hold_element(other): + try: + # try/except here is equivalent to a self._can_hold_element check, + # but this gets us back 'casted' which we will re-use below; + # without using 'casted', expressions.where may do unwanted upcasts. + casted = np_can_hold_element(self.dtype, other) + except (ValueError, TypeError): # we cannot coerce, return a compat dtype block = self.coerce_to_target_dtype(other) blocks = block.where(orig_other, cond) return self._maybe_downcast(blocks, "infer") else: + other = casted alt = setitem_datetimelike_compat(values, icond.sum(), other) if alt is not other: if is_list_like(other) and len(other) < len(values): @@ -1226,38 +1232,13 @@ def where(self, other, cond) -> list[Block]: # Note: expressions.where may upcast. result = expressions.where(~icond, values, other) + # The np_can_hold_element check _should_ ensure that we always + # have result.dtype == self.dtype here. - if self._can_hold_na or self.ndim == 1: - - if transpose: - result = result.T - - return [self.make_block(result)] - - # might need to separate out blocks - cond = ~icond - axis = cond.ndim - 1 - cond = cond.swapaxes(axis, 0) - mask = cond.all(axis=1) - - result_blocks: list[Block] = [] - for m in [mask, ~mask]: - if m.any(): - taken = result.take(m.nonzero()[0], axis=axis) - r = maybe_downcast_numeric(taken, self.dtype) - if r.dtype != taken.dtype: - warnings.warn( - "Downcasting integer-dtype results in .where is " - "deprecated and will change in a future version. " - "To retain the old behavior, explicitly cast the results " - "to the desired dtype.", - FutureWarning, - stacklevel=find_stack_level(), - ) - nb = self.make_block(r.T, placement=self._mgr_locs[m]) - result_blocks.append(nb) + if transpose: + result = result.T - return result_blocks + return [self.make_block(result)] def _unstack( self, diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index 399318a6d6118..2acb2df570b9f 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -141,11 +141,7 @@ def _check_align(df, cond, other, check_dtypes=True): # check other is ndarray cond = df > 0 - warn = None - if df is mixed_int_frame: - warn = FutureWarning - with tm.assert_produces_warning(warn, match="Downcasting integer-dtype"): - _check_align(df, cond, (_safe_add(df).values)) + _check_align(df, cond, (_safe_add(df).values)) # integers are upcast, so don't check the dtypes cond = df > 0 @@ -469,44 +465,45 @@ def test_where_axis(self, using_array_manager): # GH 9736 df = DataFrame(np.random.randn(2, 2)) mask = DataFrame([[False, False], [False, False]]) - s = Series([0, 1]) + ser = Series([0, 1]) expected = DataFrame([[0, 0], [1, 1]], dtype="float64") - result = df.where(mask, s, axis="index") + result = df.where(mask, ser, axis="index") tm.assert_frame_equal(result, expected) result = df.copy() - return_value = result.where(mask, s, axis="index", inplace=True) + return_value = result.where(mask, ser, axis="index", inplace=True) assert return_value is None tm.assert_frame_equal(result, expected) expected = DataFrame([[0, 1], [0, 1]], dtype="float64") - result = df.where(mask, s, axis="columns") + result = df.where(mask, ser, axis="columns") tm.assert_frame_equal(result, expected) result = df.copy() - return_value = result.where(mask, s, axis="columns", inplace=True) + return_value = result.where(mask, ser, axis="columns", inplace=True) assert return_value is None tm.assert_frame_equal(result, expected) + def test_where_axis_with_upcast(self, using_array_manager): # Upcast needed df = DataFrame([[1, 2], [3, 4]], dtype="int64") mask = DataFrame([[False, False], [False, False]]) - s = Series([0, np.nan]) + ser = Series([0, np.nan]) expected = DataFrame([[0, 0], [np.nan, np.nan]], dtype="float64") - result = df.where(mask, s, axis="index") + result = df.where(mask, ser, axis="index") tm.assert_frame_equal(result, expected) result = df.copy() - return_value = result.where(mask, s, axis="index", inplace=True) + return_value = result.where(mask, ser, axis="index", inplace=True) assert return_value is None tm.assert_frame_equal(result, expected) warn = FutureWarning if using_array_manager else None expected = DataFrame([[0, np.nan], [0, np.nan]]) with tm.assert_produces_warning(warn, match="Downcasting integer-dtype"): - result = df.where(mask, s, axis="columns") + result = df.where(mask, ser, axis="columns") tm.assert_frame_equal(result, expected) expected = DataFrame( @@ -516,7 +513,7 @@ def test_where_axis(self, using_array_manager): } ) result = df.copy() - return_value = result.where(mask, s, axis="columns", inplace=True) + return_value = result.where(mask, ser, axis="columns", inplace=True) assert return_value is None tm.assert_frame_equal(result, expected) From a59d5d34c866d2e0396b54dfecb04d9b077c866d Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 23 Jan 2022 21:53:10 -0800 Subject: [PATCH 2/5] mypy fixup --- pandas/core/internals/blocks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index c32e671dcc50e..4cabd3957807d 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1194,7 +1194,7 @@ def where(self, other, cond) -> list[Block]: # try/except here is equivalent to a self._can_hold_element check, # but this gets us back 'casted' which we will re-use below; # without using 'casted', expressions.where may do unwanted upcasts. - casted = np_can_hold_element(self.dtype, other) + casted = np_can_hold_element(values.dtype, other) except (ValueError, TypeError): # we cannot coerce, return a compat dtype block = self.coerce_to_target_dtype(other) From 82fcd6421c8a0cbdd9c5b914e9e39385a4ab26e0 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 23 Jan 2022 21:54:53 -0800 Subject: [PATCH 3/5] ArrayManager compat --- pandas/tests/frame/indexing/test_where.py | 6 ++---- pandas/tests/frame/methods/test_clip.py | 6 ++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index 2acb2df570b9f..0a747e3f438d7 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -485,7 +485,7 @@ def test_where_axis(self, using_array_manager): assert return_value is None tm.assert_frame_equal(result, expected) - def test_where_axis_with_upcast(self, using_array_manager): + def test_where_axis_with_upcast(self): # Upcast needed df = DataFrame([[1, 2], [3, 4]], dtype="int64") mask = DataFrame([[False, False], [False, False]]) @@ -500,10 +500,8 @@ def test_where_axis_with_upcast(self, using_array_manager): assert return_value is None tm.assert_frame_equal(result, expected) - warn = FutureWarning if using_array_manager else None expected = DataFrame([[0, np.nan], [0, np.nan]]) - with tm.assert_produces_warning(warn, match="Downcasting integer-dtype"): - result = df.where(mask, ser, axis="columns") + result = df.where(mask, ser, axis="columns") tm.assert_frame_equal(result, expected) expected = DataFrame( diff --git a/pandas/tests/frame/methods/test_clip.py b/pandas/tests/frame/methods/test_clip.py index e692948c92a26..c851e65a7ad4f 100644 --- a/pandas/tests/frame/methods/test_clip.py +++ b/pandas/tests/frame/methods/test_clip.py @@ -136,7 +136,7 @@ def test_clip_against_unordered_columns(self): tm.assert_frame_equal(result_lower, expected_lower) tm.assert_frame_equal(result_lower_upper, expected_lower_upper) - def test_clip_with_na_args(self, float_frame, using_array_manager): + def test_clip_with_na_args(self, float_frame): """Should process np.nan argument as None""" # GH#17276 tm.assert_frame_equal(float_frame.clip(np.nan), float_frame) @@ -151,9 +151,7 @@ def test_clip_with_na_args(self, float_frame, using_array_manager): ) tm.assert_frame_equal(result, expected) - warn = FutureWarning if using_array_manager else None - with tm.assert_produces_warning(warn, match="Downcasting integer-dtype"): - result = df.clip(lower=[4, 5, np.nan], axis=1) + result = df.clip(lower=[4, 5, np.nan], axis=1) expected = DataFrame( {"col_0": [4, 4, 4], "col_1": [5, 5, 6], "col_2": [7, 8, 9]} ) From 04e69889a16b3e3e5def3ba51d714779d1eef3c3 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 25 Jan 2022 15:38:52 -0800 Subject: [PATCH 4/5] compat for np lt 1.19 --- pandas/core/dtypes/cast.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 0cfe1fda88767..02ef46e0d5053 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -39,6 +39,7 @@ DtypeObj, Scalar, ) +from pandas.compat import np_version_under1p20 from pandas.errors import IntCastingNaNError from pandas.util._exceptions import find_stack_level from pandas.util._validators import validate_bool_kwarg @@ -87,6 +88,7 @@ ) from pandas.core.dtypes.inference import is_list_like from pandas.core.dtypes.missing import ( + array_equivalent, is_valid_na_for_dtype, isna, na_value_for_dtype, @@ -2021,8 +2023,12 @@ def np_can_hold_element(dtype: np.dtype, element: Any) -> Any: if isinstance(element, np.ndarray): # e.g. TestDataFrameIndexingWhere::test_where_alignment casted = element.astype(dtype) - if np.array_equal(casted, element, equal_nan=True): - return casted + if np_version_under1p20: + if array_equivalent(casted, element): + return casted + else: + if np.array_equal(casted, element, equal_nan=True): + return casted raise ValueError return element From 68238c4d8ae8a0891876b4d991d670ed710da2d2 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 28 Jan 2022 12:52:07 -0800 Subject: [PATCH 5/5] use array_equivalent --- pandas/core/dtypes/cast.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index f6f03628aff9b..216dd1e65de3a 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -39,7 +39,6 @@ DtypeObj, Scalar, ) -from pandas.compat import np_version_under1p20 from pandas.errors import IntCastingNaNError from pandas.util._exceptions import find_stack_level from pandas.util._validators import validate_bool_kwarg @@ -2032,12 +2031,9 @@ def np_can_hold_element(dtype: np.dtype, element: Any) -> Any: if isinstance(element, np.ndarray): # e.g. TestDataFrameIndexingWhere::test_where_alignment casted = element.astype(dtype) - if np_version_under1p20: - if array_equivalent(casted, element): - return casted - else: - if np.array_equal(casted, element, equal_nan=True): - return casted + # TODO(np>=1.20): we can just use np.array_equal with equal_nan + if array_equivalent(casted, element): + return casted raise ValueError return element