From f79d0a9d637084881388820e69e9dcf2bdac9c7d Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Tue, 23 Mar 2021 21:49:22 -0400 Subject: [PATCH 1/7] REGR: replace with multivalued regex raising --- pandas/core/internals/blocks.py | 12 ++++++++++-- pandas/tests/frame/methods/test_replace.py | 7 +++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 7d8dcb34ed582..173fdbc2a4cfa 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -801,10 +801,18 @@ def _replace_list( rb = [self if inplace else self.copy()] for i, (src, dest) in enumerate(pairs): + convert = i == src_len # only convert once at the end new_rb: List[Block] = [] + mask_pos = 0 for blk in rb: - m = masks[i] - convert = i == src_len # only convert once at the end + if blk.ndim == 1: + m = masks[i] + else: + # GH-39338: _replace_coerce can split a block, so we + # need to keep track of where to index into the mask + m = masks[i][mask_pos : mask_pos + blk.shape[0]] + mask_pos += blk.shape[0] + result = blk._replace_coerce( to_replace=src, value=dest, diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 56750da7c90b2..1f950a2473a14 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -654,6 +654,13 @@ def test_regex_replace_numeric_to_object_conversion(self, mix_abc): tm.assert_frame_equal(res, expec) assert res.a.dtype == np.object_ + def test_regex_replace_after_splitting_replace(self): + df = pd.DataFrame({ 'a_str' : ['A1','A2','A3'], +# 'b_int' : ['1,000','200','3'], +# 'c_str' : ['C1','C2','C3'], +# 'd_date' : ['2021-01-01', '', '2021-03-03']}) + + @pytest.mark.parametrize("metachar", ["[]", "()", r"\d", r"\w", r"\s"]) def test_replace_regex_metachar(self, metachar): df = DataFrame({"a": [metachar, "else"]}) From 8678d70713183b1679d05feab1a518c485193911 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Tue, 23 Mar 2021 22:06:37 -0400 Subject: [PATCH 2/7] Add whatsnew --- doc/source/whatsnew/v1.2.4.rst | 1 + pandas/tests/frame/methods/test_replace.py | 17 +++++++++++------ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.2.4.rst b/doc/source/whatsnew/v1.2.4.rst index 45d131327630e..6d24703a6fd46 100644 --- a/doc/source/whatsnew/v1.2.4.rst +++ b/doc/source/whatsnew/v1.2.4.rst @@ -18,6 +18,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.sum` when ``min_count`` greater than the :class:`DataFrame` shape was passed resulted in a ``ValueError`` (:issue:`39738`) - Fixed regression in :meth:`DataFrame.to_json` raising ``AttributeError`` when run on PyPy (:issue:`39837`) - Fixed regression in :meth:`DataFrame.where` not returning a copy in the case of an all True condition (:issue:`39595`) +- Fixed regression in :meth:`DataFrame.replace` raising ``IndexError`` with ``regex`` was a multi-key dictionary (:issue:`39338`) - .. --------------------------------------------------------------------------- diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 1f950a2473a14..58c458aed7e07 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -654,12 +654,17 @@ def test_regex_replace_numeric_to_object_conversion(self, mix_abc): tm.assert_frame_equal(res, expec) assert res.a.dtype == np.object_ - def test_regex_replace_after_splitting_replace(self): - df = pd.DataFrame({ 'a_str' : ['A1','A2','A3'], -# 'b_int' : ['1,000','200','3'], -# 'c_str' : ['C1','C2','C3'], -# 'd_date' : ['2021-01-01', '', '2021-03-03']}) - + @pytest.mark.parametrize( + "to_replace", [{"": np.nan, ",": ""}, {",": "", "": np.nan}] + ) + def test_joint_simple_replace_and_regex_replace(self, to_replace): + # GH-39338 + df = pd.DataFrame({"col1": ["1,000", "a", "3"], "col2": ["a", "", "b"]}) + result = df.replace(regex=to_replace) + expected = pd.DataFrame( + {"col1": ["1000", "a", "3"], "col2": ["a", np.nan, "b"]} + ) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("metachar", ["[]", "()", r"\d", r"\w", r"\s"]) def test_replace_regex_metachar(self, metachar): From a10d02150c662d130d18eb71cf3b78aa1ef41dcb Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Tue, 23 Mar 2021 22:08:30 -0400 Subject: [PATCH 3/7] Fix typo --- doc/source/whatsnew/v1.2.4.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.4.rst b/doc/source/whatsnew/v1.2.4.rst index 6d24703a6fd46..26d768f830830 100644 --- a/doc/source/whatsnew/v1.2.4.rst +++ b/doc/source/whatsnew/v1.2.4.rst @@ -18,7 +18,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.sum` when ``min_count`` greater than the :class:`DataFrame` shape was passed resulted in a ``ValueError`` (:issue:`39738`) - Fixed regression in :meth:`DataFrame.to_json` raising ``AttributeError`` when run on PyPy (:issue:`39837`) - Fixed regression in :meth:`DataFrame.where` not returning a copy in the case of an all True condition (:issue:`39595`) -- Fixed regression in :meth:`DataFrame.replace` raising ``IndexError`` with ``regex`` was a multi-key dictionary (:issue:`39338`) +- Fixed regression in :meth:`DataFrame.replace` raising ``IndexError`` when ``regex`` was a multi-key dictionary (:issue:`39338`) - .. --------------------------------------------------------------------------- From 17ff456344ff30e6a878567ca32c56bd44dd27b8 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Tue, 23 Mar 2021 22:24:04 -0400 Subject: [PATCH 4/7] Precommit fixup --- pandas/core/internals/blocks.py | 5 ++++- pandas/tests/frame/methods/test_replace.py | 6 ++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 173fdbc2a4cfa..3d62928d57a45 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -810,7 +810,10 @@ def _replace_list( else: # GH-39338: _replace_coerce can split a block, so we # need to keep track of where to index into the mask - m = masks[i][mask_pos : mask_pos + blk.shape[0]] + assert not isinstance(masks[i], bool) + # error: Value of type "Union[ExtensionArray, ndarray, bool]" + # is not indexable + m = masks[i][mask_pos : mask_pos + blk.shape[0]] # type: ignore[index] mask_pos += blk.shape[0] result = blk._replace_coerce( diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 58c458aed7e07..dabc46b7cf5db 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -659,11 +659,9 @@ def test_regex_replace_numeric_to_object_conversion(self, mix_abc): ) def test_joint_simple_replace_and_regex_replace(self, to_replace): # GH-39338 - df = pd.DataFrame({"col1": ["1,000", "a", "3"], "col2": ["a", "", "b"]}) + df = DataFrame({"col1": ["1,000", "a", "3"], "col2": ["a", "", "b"]}) result = df.replace(regex=to_replace) - expected = pd.DataFrame( - {"col1": ["1000", "a", "3"], "col2": ["a", np.nan, "b"]} - ) + expected = DataFrame({"col1": ["1000", "a", "3"], "col2": ["a", np.nan, "b"]}) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("metachar", ["[]", "()", r"\d", r"\w", r"\s"]) From 2639f2ea6676e51c5fab6f5e0e973c3807fa0a32 Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Tue, 23 Mar 2021 22:28:21 -0400 Subject: [PATCH 5/7] Fix line length --- pandas/core/internals/blocks.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 3d62928d57a45..d277e5276fa24 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -813,7 +813,9 @@ def _replace_list( assert not isinstance(masks[i], bool) # error: Value of type "Union[ExtensionArray, ndarray, bool]" # is not indexable - m = masks[i][mask_pos : mask_pos + blk.shape[0]] # type: ignore[index] + m = masks[i][ + mask_pos : mask_pos + blk.shape[0] + ] # type: ignore[index] mask_pos += blk.shape[0] result = blk._replace_coerce( From dce9629843da022d24b1dbc72f3bc50aa9611eda Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Wed, 24 Mar 2021 19:32:45 -0400 Subject: [PATCH 6/7] Fix ignore --- pandas/core/internals/blocks.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index d277e5276fa24..c07685d6bbc92 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -808,14 +808,11 @@ def _replace_list( if blk.ndim == 1: m = masks[i] else: + mib = masks[i] # GH-39338: _replace_coerce can split a block, so we # need to keep track of where to index into the mask - assert not isinstance(masks[i], bool) - # error: Value of type "Union[ExtensionArray, ndarray, bool]" - # is not indexable - m = masks[i][ - mask_pos : mask_pos + blk.shape[0] - ] # type: ignore[index] + assert not isinstance(mib, bool) + m = mib[mask_pos : mask_pos + blk.shape[0]] mask_pos += blk.shape[0] result = blk._replace_coerce( From f7061933e480a586c047d86910e99e438600dc0d Mon Sep 17 00:00:00 2001 From: Matthew Zeitlin Date: Wed, 24 Mar 2021 20:00:35 -0400 Subject: [PATCH 7/7] Account for single column split --- pandas/core/internals/blocks.py | 14 +++++++------- pandas/tests/frame/methods/test_replace.py | 16 ++++++++++++++-- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index c07685d6bbc92..c177618827edb 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -803,17 +803,17 @@ def _replace_list( for i, (src, dest) in enumerate(pairs): convert = i == src_len # only convert once at the end new_rb: List[Block] = [] - mask_pos = 0 - for blk in rb: - if blk.ndim == 1: + + # GH-39338: _replace_coerce can split a block into + # single-column blocks, so track the index so we know + # where to index into the mask + for blk_num, blk in enumerate(rb): + if len(rb) == 1: m = masks[i] else: mib = masks[i] - # GH-39338: _replace_coerce can split a block, so we - # need to keep track of where to index into the mask assert not isinstance(mib, bool) - m = mib[mask_pos : mask_pos + blk.shape[0]] - mask_pos += blk.shape[0] + m = mib[blk_num : blk_num + 1] result = blk._replace_coerce( to_replace=src, diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index dabc46b7cf5db..d8f93f047e74b 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -659,9 +659,21 @@ def test_regex_replace_numeric_to_object_conversion(self, mix_abc): ) def test_joint_simple_replace_and_regex_replace(self, to_replace): # GH-39338 - df = DataFrame({"col1": ["1,000", "a", "3"], "col2": ["a", "", "b"]}) + df = DataFrame( + { + "col1": ["1,000", "a", "3"], + "col2": ["a", "", "b"], + "col3": ["a", "b", "c"], + } + ) result = df.replace(regex=to_replace) - expected = DataFrame({"col1": ["1000", "a", "3"], "col2": ["a", np.nan, "b"]}) + expected = DataFrame( + { + "col1": ["1000", "a", "3"], + "col2": ["a", np.nan, "b"], + "col3": ["a", "b", "c"], + } + ) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("metachar", ["[]", "()", r"\d", r"\w", r"\s"])