From 65fe412fae7ec2115b3842709c361657f20e05ed Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 7 May 2020 13:29:34 +0100 Subject: [PATCH 1/5] Revert "CLN: short-circuit case in Block.replace (#27768)" This reverts commit 01f90c187f0eec0e8178371d7c066e600c9e105b. --- pandas/core/internals/blocks.py | 36 ++------------------------------- 1 file changed, 2 insertions(+), 34 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index e4dcffae45f67..4550eef0f0a1b 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -684,26 +684,8 @@ def replace( # If we cannot replace with own dtype, convert to ObjectBlock and # retry if not self._can_hold_element(to_replace): - if not isinstance(to_replace, list): - if inplace: - return [self] - return [self.copy()] - - to_replace = [x for x in to_replace if self._can_hold_element(x)] - if not len(to_replace): - # GH#28084 avoid costly checks since we can infer - # that there is nothing to replace in this block - if inplace: - return [self] - return [self.copy()] - - if len(to_replace) == 1: - # _can_hold_element checks have reduced this back to the - # scalar case and we can avoid a costly object cast - return self.replace( - to_replace[0], value, inplace=inplace, regex=regex, convert=convert, - ) - + # TODO: we should be able to infer at this point that there is + # nothing to replace # GH 22083, TypeError or ValueError occurred within error handling # causes infinite loop. Cast and retry only if not objectblock. if is_object_dtype(self): @@ -728,27 +710,14 @@ def replace( mask = missing.mask_missing(values, to_replace) - if not mask.any(): - if inplace: - return [self] - return [self.copy()] - try: blocks = self.putmask(mask, value, inplace=inplace) - # Note: it is _not_ the case that self._can_hold_element(value) - # is always true at this point. 
In particular, that can fail - # for: - # "2u" with bool-dtype, float-dtype - # 0.5 with int64-dtype - # np.nan with int64-dtype except (TypeError, ValueError): # GH 22083, TypeError or ValueError occurred within error handling # causes infinite loop. Cast and retry only if not objectblock. if is_object_dtype(self): raise - assert not self._can_hold_element(value), value - # try again with a compatible block block = self.astype(object) return block.replace( @@ -905,7 +874,6 @@ def putmask( # if we are passed a scalar None, convert it here if not is_list_like(new) and isna(new) and not self.is_object: - # FIXME: make sure we have compatible NA new = self.fill_value if self._can_hold_element(new): From 0cc787f769b8220dd0faf261aa15c62a447d5dce Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 7 May 2020 14:45:41 +0100 Subject: [PATCH 2/5] add test case for 32988 --- pandas/tests/frame/methods/test_replace.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index a9fb686d5bc50..9259eaa1ab191 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -1380,3 +1380,15 @@ def test_replace_invalid_to_replace(self): ) with pytest.raises(TypeError, match=msg): df.replace(lambda x: x.strip()) + + @pytest.mark.parametrize( + "dtype", ["float", "float64", "int64", "Int64", "boolean"] + ) + @pytest.mark.parametrize( + "value", [np.nan, pd.NA] + ) + def test_replace_no_replacement_dtypes(self, dtype, value): + # https://github.com/pandas-dev/pandas/issues/32988 + df = pd.DataFrame(np.eye(2), dtype=dtype) + result = df.replace(to_replace=[None, -np.inf, np.inf], value=value) + tm.assert_frame_equal(result, df) From 0753d76c4e6bde96ff87e52377ff012415c9a97e Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 7 May 2020 16:28:40 +0100 Subject: [PATCH 3/5] black fixup --- pandas/tests/frame/methods/test_replace.py | 8 
++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 9259eaa1ab191..444aa45155dbf 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -1381,12 +1381,8 @@ def test_replace_invalid_to_replace(self): with pytest.raises(TypeError, match=msg): df.replace(lambda x: x.strip()) - @pytest.mark.parametrize( - "dtype", ["float", "float64", "int64", "Int64", "boolean"] - ) - @pytest.mark.parametrize( - "value", [np.nan, pd.NA] - ) + @pytest.mark.parametrize("dtype", ["float", "float64", "int64", "Int64", "boolean"]) + @pytest.mark.parametrize("value", [np.nan, pd.NA]) def test_replace_no_replacement_dtypes(self, dtype, value): # https://github.com/pandas-dev/pandas/issues/32988 df = pd.DataFrame(np.eye(2), dtype=dtype) From 665598043222ce678a4857a98aa137784b013d1c Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 7 May 2020 16:31:59 +0100 Subject: [PATCH 4/5] add whatsnew --- doc/source/whatsnew/v1.1.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 9c424f70b1ee0..2da257bd6abc2 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -744,6 +744,7 @@ Reshaping - Bug in :meth:`DataFrame.unstack` when MultiIndexed columns and MultiIndexed rows were used (:issue:`32624`, :issue:`24729` and :issue:`28306`) - Bug in :func:`concat` was not allowing for concatenation of ``DataFrame`` and ``Series`` with duplicate keys (:issue:`33654`) - Bug in :func:`cut` raised an error when non-unique labels (:issue:`33141`) +- Bug in :meth:`DataFrame.replace` where columns were cast to ``object`` dtype if items in ``to_replace`` were not in values (:issue:`32988`) Sparse From 4f871e010a2906c9f686ca80b2a5da248b392614 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 7 May 2020 18:37:10 +0100 Subject: [PATCH 5/5] partial revert revert 
--- pandas/core/internals/blocks.py | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 4550eef0f0a1b..69a5d25b59810 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -684,8 +684,26 @@ def replace( # If we cannot replace with own dtype, convert to ObjectBlock and # retry if not self._can_hold_element(to_replace): - # TODO: we should be able to infer at this point that there is - # nothing to replace + if not isinstance(to_replace, list): + if inplace: + return [self] + return [self.copy()] + + to_replace = [x for x in to_replace if self._can_hold_element(x)] + if not len(to_replace): + # GH#28084 avoid costly checks since we can infer + # that there is nothing to replace in this block + if inplace: + return [self] + return [self.copy()] + + if len(to_replace) == 1: + # _can_hold_element checks have reduced this back to the + # scalar case and we can avoid a costly object cast + return self.replace( + to_replace[0], value, inplace=inplace, regex=regex, convert=convert, + ) + # GH 22083, TypeError or ValueError occurred within error handling # causes infinite loop. Cast and retry only if not objectblock. if is_object_dtype(self): @@ -712,12 +730,20 @@ def replace( try: blocks = self.putmask(mask, value, inplace=inplace) + # Note: it is _not_ the case that self._can_hold_element(value) + # is always true at this point. In particular, that can fail + # for: + # "2u" with bool-dtype, float-dtype + # 0.5 with int64-dtype + # np.nan with int64-dtype except (TypeError, ValueError): # GH 22083, TypeError or ValueError occurred within error handling # causes infinite loop. Cast and retry only if not objectblock. 
if is_object_dtype(self): raise + assert not self._can_hold_element(value), value + # try again with a compatible block block = self.astype(object) return block.replace( @@ -874,6 +900,7 @@ def putmask( # if we are passed a scalar None, convert it here if not is_list_like(new) and isna(new) and not self.is_object: + # FIXME: make sure we have compatible NA new = self.fill_value if self._can_hold_element(new):