From 93ea131fb9c0631651b3c576dca7c70678ff56ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marco=20Aur=C3=A9lio=20Barbosa?= Date: Mon, 26 Feb 2024 17:20:25 -0300 Subject: [PATCH 01/13] TST: add a test for preserving dtype while calling frame.update (#55509) --- pandas/tests/frame/methods/test_update.py | 29 +++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/pandas/tests/frame/methods/test_update.py b/pandas/tests/frame/methods/test_update.py index 788c6220b2477..c6cc906a7893f 100644 --- a/pandas/tests/frame/methods/test_update.py +++ b/pandas/tests/frame/methods/test_update.py @@ -184,3 +184,32 @@ def test_update_dt_column_with_NaT_create_column(self): {"A": [1.0, 3.0], "B": [pd.NaT, pd.to_datetime("2016-01-01")]} ) tm.assert_frame_equal(df, expected) + + @pytest.mark.parametrize( + "value_df, value_other, dtype", + [ + (True, False, bool), + (1, 2, int), + (np.uint64(1), np.uint(2), np.dtype("uint64")), + (1.0, 2.0, float), + (1.0 + 1j, 2.0 + 2j, complex), + ("a", "b", pd.StringDtype()), + ( + pd.to_timedelta("1 ms"), + pd.to_timedelta("2 ms"), + np.dtype("timedelta64[ns]"), + ), + ( + np.datetime64("2000-01-01T00:00:00"), + np.datetime64("2000-01-02T00:00:00"), + np.dtype("datetime64[ns]"), + ), + ], + ) + def test_update_preserve_dtype(self, value_df, value_other, dtype): + # GH#55509 + df = DataFrame({"a": [value_df] * 2}, index=[1, 2]) + other = DataFrame({"a": [value_other]}, index=[1]) + expected = DataFrame({"a": [value_other, value_df]}, index=[1, 2]) + df.update(other) + tm.assert_frame_equal(df, expected) From eb963dfefbf986dc62d5c7c360b78bd1ab061076 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marco=20Aur=C3=A9lio=20Barbosa?= Date: Mon, 26 Feb 2024 17:25:24 -0300 Subject: [PATCH 02/13] TST: Add a test for frame.update raising on duplicate argument indexes (#55509) --- pandas/tests/frame/methods/test_update.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/tests/frame/methods/test_update.py b/pandas/tests/frame/methods/test_update.py index c6cc906a7893f..2724bfe5f4cf5 100644 --- a/pandas/tests/frame/methods/test_update.py +++ b/pandas/tests/frame/methods/test_update.py @@ -213,3 +213,10 @@ def test_update_preserve_dtype(self, value_df, value_other, dtype): expected = DataFrame({"a": [value_other, value_df]}, index=[1, 2]) df.update(other) tm.assert_frame_equal(df, expected) + + def test_update_raises_on_duplicate_argument_index(self): + # GH#55509 + df = DataFrame({"a": [1, 1]}, index=[1, 2]) + other = DataFrame({"a": [2, 3]}, index=[1, 1]) + with pytest.raises(ValueError, match="duplicate index"): + df.update(other) From d5a1085b20f5a23ba066e355e774f59e8767c261 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marco=20Aur=C3=A9lio=20Barbosa?= Date: Mon, 26 Feb 2024 17:27:19 -0300 Subject: [PATCH 03/13] TST: frame.update accepts duplicate frame index with unique argument #55509 --- pandas/tests/frame/methods/test_update.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/frame/methods/test_update.py b/pandas/tests/frame/methods/test_update.py index 2724bfe5f4cf5..107642fc88eab 100644 --- a/pandas/tests/frame/methods/test_update.py +++ b/pandas/tests/frame/methods/test_update.py @@ -220,3 +220,11 @@ def test_update_raises_on_duplicate_argument_index(self): other = DataFrame({"a": [2, 3]}, index=[1, 1]) with pytest.raises(ValueError, match="duplicate index"): df.update(other) + + def test_update_on_duplicate_frame_unique_argument_index(self): + # GH#55509 + df = DataFrame({"a": [1, 1, 1]}, index=[1, 1, 2]) + other = DataFrame({"a": [2, 3]}, index=[1, 2]) + expected = DataFrame({"a": [2, 2, 3]}, index=[1, 1, 2]) + df.update(other) + tm.assert_frame_equal(df, expected) From 02c1b7744d86392323e5bf017601883f29706c01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marco=20Aur=C3=A9lio=20Barbosa?= Date: Mon, 26 Feb 2024 23:26:52 -0300 Subject: [PATCH 04/13] BUG: fix dataframe.update not preserving dtypes (#55509) --- pandas/core/frame.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e5d424b15e69e..984042418782a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8764,11 +8764,22 @@ def update( if not isinstance(other, DataFrame): other = DataFrame(other) - other = other.reindex(self.index) + if other.index.has_duplicates: + raise ValueError("Update not allowed with duplicate indexes on other.") + + rows = other.index.intersection(self.index) + if rows.empty: + raise ValueError( + "Can't update dataframe when other has no index in common with " + "this dataframe." + ) + + other = other.reindex(rows) + this_data = self.loc[rows] for col in self.columns.intersection(other.columns): - this = self[col]._values - that = other[col]._values + this = this_data[col] + that = other[col] if filter_func is not None: mask = ~filter_func(this) | isna(that) @@ -8788,7 +8799,7 @@ def update( if mask.all(): continue - self.loc[:, col] = self[col].where(mask, that) + self.loc[rows, col] = this.where(mask, that) # ---------------------------------------------------------------------- # Data reshaping From 516902242eaadf64a41f6aeaa86c28e05f7994d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marco=20Aur=C3=A9lio=20Barbosa?= Date: Tue, 27 Feb 2024 13:57:13 -0300 Subject: [PATCH 05/13] DOC: Add line indicating bug fix #55509 --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 8bb051b6228ce..04ad6a1563108 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -264,6 +264,7 @@ Bug fixes ~~~~~~~~~ - Fixed bug in :meth:`DataFrame.join` inconsistently setting result index name (:issue:`55815`) - Fixed bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`) +- Fixed bug in :meth:`DataFrame.update` bool dtype being converted to object (:issue:`55509`) - Fixed bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument. (:issue:`56607`) Categorical From d7f63a1e16d5b8fc6602f0ea00cbd31368cee158 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marco=20Aur=C3=A9lio=20Barbosa?= Date: Wed, 28 Feb 2024 10:17:44 -0300 Subject: [PATCH 06/13] DOC: add note on duplicate indices on parameter other (#55509) --- pandas/core/frame.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 984042418782a..5111cdc9c65eb 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8692,6 +8692,10 @@ def update( dict.update : Similar method for dictionaries. DataFrame.merge : For column(s)-on-column(s) operations. + Notes + -------- + 1. Duplicate indices on `other` are not supported and raises `ValueError`. + Examples -------- >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [400, 500, 600]}) From 1fd2be9ce0269207b4182344240b03ec84b4576b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marco=20Aur=C3=A9lio=20Barbosa?= Date: Thu, 29 Feb 2024 21:12:37 -0300 Subject: [PATCH 07/13] TST: assure test_update_preserve_dtype checks dtype --- pandas/tests/frame/methods/test_update.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/tests/frame/methods/test_update.py b/pandas/tests/frame/methods/test_update.py index 107642fc88eab..16b311273421f 100644 --- a/pandas/tests/frame/methods/test_update.py +++ b/pandas/tests/frame/methods/test_update.py @@ -190,9 +190,10 @@ def test_update_dt_column_with_NaT_create_column(self): [ (True, False, bool), (1, 2, int), - (np.uint64(1), np.uint(2), np.dtype("uint64")), (1.0, 2.0, float), (1.0 + 1j, 2.0 + 2j, complex), + (np.uint64(1), np.uint(2), np.dtype("ubyte")), + (np.uint64(1), np.uint(2), np.dtype("int_")), ("a", "b", pd.StringDtype()), ( pd.to_timedelta("1 ms"), @@ -208,9 +209,9 @@ def test_update_dt_column_with_NaT_create_column(self): ) def test_update_preserve_dtype(self, value_df, value_other, dtype): # GH#55509 - df = DataFrame({"a": [value_df] * 2}, index=[1, 2]) - other = DataFrame({"a": [value_other]}, index=[1]) - expected = DataFrame({"a": [value_other, value_df]}, index=[1, 2]) + df = DataFrame({"a": [value_df] * 2}, index=[1, 2], dtype=dtype) + other = DataFrame({"a": [value_other]}, index=[1], dtype=dtype) + expected = DataFrame({"a": [value_other, value_df]}, index=[1, 2], dtype=dtype) df.update(other) tm.assert_frame_equal(df, expected) From d948e0a69f7699e43b11b337466ddce97be3adb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marco=20Aur=C3=A9lio=20Barbosa?= Date: Thu, 29 Feb 2024 21:14:21 -0300 Subject: [PATCH 08/13] TST: assure test_update_on_duplicate_frame_unique_argument_index checks dtype --- pandas/tests/frame/methods/test_update.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/frame/methods/test_update.py b/pandas/tests/frame/methods/test_update.py index 16b311273421f..f78690b0086a8 100644 --- a/pandas/tests/frame/methods/test_update.py +++ b/pandas/tests/frame/methods/test_update.py @@ -224,8 +224,8 @@ def test_update_raises_on_duplicate_argument_index(self): def test_update_on_duplicate_frame_unique_argument_index(self): # GH#55509 - df = DataFrame({"a": [1, 1, 1]}, index=[1, 1, 2]) - other = DataFrame({"a": [2, 3]}, index=[1, 2]) - expected = DataFrame({"a": [2, 2, 3]}, index=[1, 1, 2]) + df = DataFrame({"a": [1, 1, 1]}, index=[1, 1, 2], dtype=np.dtype("int_")) + other = DataFrame({"a": [2, 3]}, index=[1, 2], dtype=np.dtype("int_")) + expected = DataFrame({"a": [2, 2, 3]}, index=[1, 1, 2], dtype=np.dtype("int_")) df.update(other) tm.assert_frame_equal(df, expected) From dbdff5c10d2c31b48bad330aaeb22fd2afc93f84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marco=20Aur=C3=A9lio=20Barbosa?= Date: Thu, 29 Feb 2024 22:00:33 -0300 Subject: [PATCH 09/13] TST: Use np.intc instead of np.int64 on dtype tests for frame.update (#55509) --- pandas/tests/frame/methods/test_update.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/frame/methods/test_update.py b/pandas/tests/frame/methods/test_update.py index f78690b0086a8..0bcbf7ee700c6 100644 --- a/pandas/tests/frame/methods/test_update.py +++ b/pandas/tests/frame/methods/test_update.py @@ -193,7 +193,7 @@ def test_update_dt_column_with_NaT_create_column(self): (1.0, 2.0, float), (1.0 + 1j, 2.0 + 2j, complex), (np.uint64(1), np.uint(2), np.dtype("ubyte")), - (np.uint64(1), np.uint(2), np.dtype("int_")), + (np.uint64(1), np.uint(2), np.dtype("intc")), ("a", "b", pd.StringDtype()), ( pd.to_timedelta("1 ms"), @@ -224,8 +224,8 @@ def test_update_raises_on_duplicate_argument_index(self): def test_update_on_duplicate_frame_unique_argument_index(self): # GH#55509 - df = DataFrame({"a": [1, 1, 1]}, index=[1, 1, 2], dtype=np.dtype("int_")) - other = DataFrame({"a": [2, 3]}, index=[1, 2], dtype=np.dtype("int_")) - expected = DataFrame({"a": [2, 2, 3]}, index=[1, 1, 2], dtype=np.dtype("int_")) + df = DataFrame({"a": [1, 1, 1]}, index=[1, 1, 2], dtype=np.dtype("intc")) + other = DataFrame({"a": [2, 3]}, index=[1, 2], dtype=np.dtype("intc")) + expected = DataFrame({"a": [2, 2, 3]}, index=[1, 1, 2], dtype=np.dtype("intc")) df.update(other) tm.assert_frame_equal(df, expected) From 6de3fceea71ceffe151548157f6c6fcb9f5ec8bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marco=20Aur=C3=A9lio=20Barbosa?= Date: Fri, 1 Mar 2024 12:07:12 -0300 Subject: [PATCH 10/13] DOC: Fix separator size (#55509). Minor issue. --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 70960a1ea50a1..54e079da76729 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8707,7 +8707,7 @@ def update( DataFrame.merge : For column(s)-on-column(s) operations. Notes - -------- + ----- 1. Duplicate indices on `other` are not supported and raises `ValueError`. Examples From a62684fb64cfbeb02265b182e23e42b354f84136 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marco=20Aur=C3=A9lio=20Barbosa?= Date: Fri, 1 Mar 2024 12:56:24 -0300 Subject: [PATCH 11/13] DOC: fix error messages on frame.update (PR #57637) --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 54e079da76729..adcc5ec83992a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8788,7 +8788,7 @@ def update( rows = other.index.intersection(self.index) if rows.empty: raise ValueError( - "Can't update dataframe when other has no index in common with " + "Update not allowed when other has no index in common with " "this dataframe." ) From 8a28fb05b06c40300f10d93e8839d2f8d259e8c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marco=20Aur=C3=A9lio=20Barbosa?= Date: Fri, 1 Mar 2024 20:53:16 -0300 Subject: [PATCH 12/13] TST: add test_update_raises_without_intersection on DataFrame (#55509). --- pandas/core/frame.py | 4 ++-- pandas/tests/frame/methods/test_update.py | 7 +++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index adcc5ec83992a..8c09a321f9bd0 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8788,8 +8788,8 @@ def update( rows = other.index.intersection(self.index) if rows.empty: raise ValueError( - "Update not allowed when other has no index in common with " - "this dataframe." + "Update not allowed when the index on `other` has no intersection " + "with this dataframe." ) other = other.reindex(rows) diff --git a/pandas/tests/frame/methods/test_update.py b/pandas/tests/frame/methods/test_update.py index 0bcbf7ee700c6..269b9e372bd70 100644 --- a/pandas/tests/frame/methods/test_update.py +++ b/pandas/tests/frame/methods/test_update.py @@ -222,6 +222,13 @@ def test_update_raises_on_duplicate_argument_index(self): with pytest.raises(ValueError, match="duplicate index"): df.update(other) + def test_update_raises_without_intersection(self): + # GH#55509 + df = DataFrame({"a": [1]}, index=[1]) + other = DataFrame({"a": [2]}, index=[2]) + with pytest.raises(ValueError, match="no intersection"): + df.update(other) + def test_update_on_duplicate_frame_unique_argument_index(self): # GH#55509 df = DataFrame({"a": [1, 1, 1]}, index=[1, 1, 2], dtype=np.dtype("intc")) From f08e92e20c67dd68ff15fe3ebb5faee786d240b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marco=20Aur=C3=A9lio=20Barbosa?= Date: Fri, 1 Mar 2024 20:58:17 -0300 Subject: [PATCH 13/13] Rename variable. --- pandas/core/frame.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8c09a321f9bd0..54cefabb6097a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8785,15 +8785,15 @@ def update( if other.index.has_duplicates: raise ValueError("Update not allowed with duplicate indexes on other.") - rows = other.index.intersection(self.index) - if rows.empty: + index_intersection = other.index.intersection(self.index) + if index_intersection.empty: raise ValueError( "Update not allowed when the index on `other` has no intersection " "with this dataframe." ) - other = other.reindex(rows) - this_data = self.loc[rows] + other = other.reindex(index_intersection) + this_data = self.loc[index_intersection] for col in self.columns.intersection(other.columns): this = this_data[col] @@ -8817,7 +8817,7 @@ def update( if mask.all(): continue - self.loc[rows, col] = this.where(mask, that) + self.loc[index_intersection, col] = this.where(mask, that) # ---------------------------------------------------------------------- # Data reshaping