From 1774489a67b65f3cdba570071ad97bc3b093c459 Mon Sep 17 00:00:00 2001 From: tirth choksi Date: Wed, 15 May 2024 12:22:55 +0530 Subject: [PATCH 01/21] Add np.uintc to _factorizers in merge.py to fix KeyError when merging DataFrames with uintc columns --- pandas/core/reshape/merge.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index e6e84c2135b82..aebdd5b1f5d22 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -124,6 +124,7 @@ np.complex64: libhashtable.Complex64Factorizer, np.complex128: libhashtable.Complex128Factorizer, np.object_: libhashtable.ObjectFactorizer, + np.uintc: libhashtable.UInt32Factorizer, } # See https://github.com/pandas-dev/pandas/issues/52451 From 5372107eccf978e34b14201fb780d8e67b46bee1 Mon Sep 17 00:00:00 2001 From: tirth choksi Date: Wed, 15 May 2024 16:05:32 +0530 Subject: [PATCH 02/21] add np.uintc to _factorizers in merge.py --- pandas/tests/reshape/merge/test_merge.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 5c5c06dea0008..f6708756c6965 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1490,10 +1490,12 @@ def test_different(self, dtype): # categorical cols to object result = merge(left, right, on="A") assert is_object_dtype(result.A.dtype) or is_string_dtype(result.A.dtype) - + @pytest.mark.parametrize( + "d1", [np.int64, np.int32, np.intc, np.int16, np.int8, np.uint8] + ) @pytest.mark.parametrize("d2", [np.int64, np.float64, np.float32, np.float16]) - def test_join_multi_dtypes(self, any_int_numpy_dtype, d2): - dtype1 = np.dtype(any_int_numpy_dtype) + def test_join_multi_dtypes(self, d1, d2): + dtype1 = np.dtype(d1) dtype2 = np.dtype(d2) left = DataFrame( From 3d75d94ee172a8ea1292a7c501117606fc6ecfd5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 15 May 2024 11:22:38 +0000 Subject: [PATCH 03/21] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pandas/tests/reshape/merge/test_merge.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index f6708756c6965..2370620ebaced 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1490,6 +1490,7 @@ def test_different(self, dtype): # categorical cols to object result = merge(left, right, on="A") assert is_object_dtype(result.A.dtype) or is_string_dtype(result.A.dtype) + @pytest.mark.parametrize( "d1", [np.int64, np.int32, np.intc, np.int16, np.int8, np.uint8] ) From 0f79322e88adac747d7ca9e1bfd280ff70a16e79 Mon Sep 17 00:00:00 2001 From: tirth choksi Date: Wed, 15 May 2024 19:11:22 +0530 Subject: [PATCH 04/21] changes according to review --- doc/source/whatsnew/v3.0.0.rst | 11 +++++++++++ pandas/core/reshape/merge.py | 5 ++++- pandas/tests/reshape/merge/test_merge.py | 2 +- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 083e004fb94fa..a53b64d304957 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -153,6 +153,17 @@ Optional libraries below the lowest tested version may still work, but are not c See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more. +Bug Fixes +--------- + +- Fixed an issue (`#58713 `_) where ``pd.merge`` failed to handle certain combinations of data types correctly. This issue caused unexpected behavior or errors when merging DataFrames with different data types, particularly when using ``np.intc`` or ``np.uintc`` data types. + + To address this issue, comprehensive testing coverage has been added to ensure that ``pd.merge`` behaves consistently across different data type combinations. The tests include parameterized fixtures with various combinations of data types, including ``np.intc`` and ``np.uintc``, to verify correct behavior under different scenarios. + + This fix ensures reliable and consistent behavior of ``pd.merge`` when handling DataFrames with diverse data types, improving the stability and robustness of data merging operations in ``pandas``. + + + .. _whatsnew_300.api_breaking.other: Other API changes diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index aebdd5b1f5d22..f16acc4895d0e 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -124,13 +124,16 @@ np.complex64: libhashtable.Complex64Factorizer, np.complex128: libhashtable.Complex128Factorizer, np.object_: libhashtable.ObjectFactorizer, - np.uintc: libhashtable.UInt32Factorizer, + } # See https://github.com/pandas-dev/pandas/issues/52451 if np.intc is not np.int32: _factorizers[np.intc] = libhashtable.Int64Factorizer +if np.uintc is not np.uint32: + _factorizers[np.uintc] = libhashtable.UInt32Factorizer + _known = (np.ndarray, ExtensionArray, Index, ABCSeries) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index f6708756c6965..778f85273df62 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1491,7 +1491,7 @@ def test_different(self, dtype): result = merge(left, right, on="A") assert is_object_dtype(result.A.dtype) or is_string_dtype(result.A.dtype) @pytest.mark.parametrize( - "d1", [np.int64, np.int32, np.intc, np.int16, np.int8, np.uint8] + "d1", [np.int64, np.int32, np.intc, np.int16, np.int8, np.uint8, np.uintc] ) @pytest.mark.parametrize("d2", [np.int64, np.float64, np.float32, np.float16]) def test_join_multi_dtypes(self, d1, d2): From 1373e0584e357353b3c3573eacadb50d78d8ccf8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 15 May 2024 13:46:46 +0000 Subject: [PATCH 05/21] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pandas/core/reshape/merge.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index f16acc4895d0e..87594002d4f92 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -124,7 +124,6 @@ np.complex64: libhashtable.Complex64Factorizer, np.complex128: libhashtable.Complex128Factorizer, np.object_: libhashtable.ObjectFactorizer, - } # See https://github.com/pandas-dev/pandas/issues/52451 From 16adf4b812adbdaef5be407c2c95a85d2fe1366c Mon Sep 17 00:00:00 2001 From: tirth choksi Date: Wed, 15 May 2024 20:43:51 +0530 Subject: [PATCH 06/21] final commit --- doc/source/whatsnew/v3.0.0.rst | 2 -- 1 file changed, 2 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index a53b64d304957..a708e076ac523 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -162,8 +162,6 @@ Bug Fixes This fix ensures reliable and consistent behavior of ``pd.merge`` when handling DataFrames with diverse data types, improving the stability and robustness of data merging operations in ``pandas``. - - .. _whatsnew_300.api_breaking.other: Other API changes From 523efa4995edb8e7af68274bbfbb591fd7ea496e Mon Sep 17 00:00:00 2001 From: tirth choksi Date: Thu, 16 May 2024 08:56:42 +0530 Subject: [PATCH 07/21] final commit --- doc/source/whatsnew/v3.0.0.rst | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index a708e076ac523..af486570b16d7 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -113,6 +113,15 @@ These improvements also fixed certain bugs in groupby: - :meth:`.DataFrameGroupBy.sum` would have incorrect values when there are multiple groupings, unobserved groups, and non-numeric data (:issue:`43891`) - :meth:`.DataFrameGroupBy.value_counts` would produce incorrect results when used with some categorical and some non-categorical groupings and ``observed=False`` (:issue:`56016`) +Bug Fixes +--------- + +- Fixed an issue (`#58713 `_) where ``pd.merge`` failed to handle certain combinations of data types correctly. This issue caused unexpected behavior or errors when merging DataFrames with different data types, particularly when using ``np.intc`` or ``np.uintc`` data types. + + To address this issue, comprehensive testing coverage has been added to ensure that ``pd.merge`` behaves consistently across different data type combinations. The tests include parameterized fixtures with various combinations of data types, including ``np.intc`` and ``np.uintc``, to verify correct behavior under different scenarios. + + This fix ensures reliable and consistent behavior of ``pd.merge`` when handling DataFrames with diverse data types, improving the stability and robustness of data merging operations in ``pandas``. + .. _whatsnew_300.notable_bug_fixes.notable_bug_fix2: notable_bug_fix2 @@ -153,15 +162,6 @@ Optional libraries below the lowest tested version may still work, but are not c See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more. -Bug Fixes ---------- - -- Fixed an issue (`#58713 `_) where ``pd.merge`` failed to handle certain combinations of data types correctly. This issue caused unexpected behavior or errors when merging DataFrames with different data types, particularly when using ``np.intc`` or ``np.uintc`` data types. - - To address this issue, comprehensive testing coverage has been added to ensure that ``pd.merge`` behaves consistently across different data type combinations. The tests include parameterized fixtures with various combinations of data types, including ``np.intc`` and ``np.uintc``, to verify correct behavior under different scenarios. - - This fix ensures reliable and consistent behavior of ``pd.merge`` when handling DataFrames with diverse data types, improving the stability and robustness of data merging operations in ``pandas``. - .. _whatsnew_300.api_breaking.other: Other API changes From 67297bd3e1b492c52df92986db1bd0c04ea02e00 Mon Sep 17 00:00:00 2001 From: tirth choksi Date: Thu, 16 May 2024 11:04:54 +0530 Subject: [PATCH 08/21] doc commit --- doc/source/whatsnew/v3.0.0.rst | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index af486570b16d7..d17248cbb5534 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -113,20 +113,16 @@ These improvements also fixed certain bugs in groupby: - :meth:`.DataFrameGroupBy.sum` would have incorrect values when there are multiple groupings, unobserved groups, and non-numeric data (:issue:`43891`) - :meth:`.DataFrameGroupBy.value_counts` would produce incorrect results when used with some categorical and some non-categorical groupings and ``observed=False`` (:issue:`56016`) -Bug Fixes ---------- +.. _whatsnew_300.notable_bug_fixes.notable_bug_fix2: +notable_bug_fix2 +^^^^^^^^^^^^^^^^ - Fixed an issue (`#58713 `_) where ``pd.merge`` failed to handle certain combinations of data types correctly. This issue caused unexpected behavior or errors when merging DataFrames with different data types, particularly when using ``np.intc`` or ``np.uintc`` data types. To address this issue, comprehensive testing coverage has been added to ensure that ``pd.merge`` behaves consistently across different data type combinations. The tests include parameterized fixtures with various combinations of data types, including ``np.intc`` and ``np.uintc``, to verify correct behavior under different scenarios. This fix ensures reliable and consistent behavior of ``pd.merge`` when handling DataFrames with diverse data types, improving the stability and robustness of data merging operations in ``pandas``. -.. _whatsnew_300.notable_bug_fixes.notable_bug_fix2: - -notable_bug_fix2 -^^^^^^^^^^^^^^^^ - .. --------------------------------------------------------------------------- .. _whatsnew_300.api_breaking: From c05255ab8f6e281c46103a00e7ec923f0d740215 Mon Sep 17 00:00:00 2001 From: tirth choksi Date: Thu, 16 May 2024 22:58:40 +0530 Subject: [PATCH 09/21] final commit --- doc/source/whatsnew/v3.0.0.rst | 6 +----- pandas/core/reshape/merge.py | 2 +- pandas/tests/reshape/merge/test_merge.py | 16 ++++++++++------ 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 3de5bd68ed467..fa8507abcff62 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -117,11 +117,6 @@ These improvements also fixed certain bugs in groupby: notable_bug_fix2 ^^^^^^^^^^^^^^^^ -- Fixed an issue (`#58713 `_) where ``pd.merge`` failed to handle certain combinations of data types correctly. This issue caused unexpected behavior or errors when merging DataFrames with different data types, particularly when using ``np.intc`` or ``np.uintc`` data types. - - To address this issue, comprehensive testing coverage has been added to ensure that ``pd.merge`` behaves consistently across different data type combinations. The tests include parameterized fixtures with various combinations of data types, including ``np.intc`` and ``np.uintc``, to verify correct behavior under different scenarios. - - This fix ensures reliable and consistent behavior of ``pd.merge`` when handling DataFrames with diverse data types, improving the stability and robustness of data merging operations in ``pandas``. .. --------------------------------------------------------------------------- .. _whatsnew_300.api_breaking: @@ -478,6 +473,7 @@ Groupby/resample/rolling Reshaping ^^^^^^^^^ - Bug in :meth:`DataFrame.join` inconsistently setting result index name (:issue:`55815`) +- Fixed issue in `pd.merge` (`#58713`) where merging DataFrames with `np.intc` or `np.uintc` data types caused unexpected behavior or errors. Comprehensive testing now ensures consistent behavior across diverse data type combinations, enhancing stability and robustness of data merging operations. - Sparse diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 87594002d4f92..be921c9e9fd24 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -131,7 +131,7 @@ _factorizers[np.intc] = libhashtable.Int64Factorizer if np.uintc is not np.uint32: - _factorizers[np.uintc] = libhashtable.UInt32Factorizer + _factorizers[np.uintc] = libhashtable.UInt64Factorizer _known = (np.ndarray, ExtensionArray, Index, ABCSeries) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 562dee616bc21..1b32cad632e10 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1491,14 +1491,18 @@ def test_different(self, dtype): result = merge(left, right, on="A") assert is_object_dtype(result.A.dtype) or is_string_dtype(result.A.dtype) - @pytest.mark.parametrize( - "d1", [np.int64, np.int32, np.intc, np.int16, np.int8, np.uint8, np.uintc] - ) @pytest.mark.parametrize("d2", [np.int64, np.float64, np.float32, np.float16]) - def test_join_multi_dtypes(self, d1, d2): - dtype1 = np.dtype(d1) + def test_join_multi_dtypes(self, any_int_numpy_dtype, d2): + dtype1 = np.dtype(any_int_numpy_dtype) dtype2 = np.dtype(d2) - + + # New test implementation for np.uintc here + @pytest.mark.parametrize("d1", [np.uintc]) + @pytest.mark.parametrize("d2", [np.int64, np.float64, np.float32, np.float16]) + def test_join_multi_dtypes_with_uintc(self, d1, d2): + dtype1 = np.dtype(d1) + dtype2 = np.dtype(d2) + left = DataFrame( { "k1": np.array([0, 1, 2] * 8, dtype=dtype1), From 2adb5fc46c6f112e6865e46a36122daae4f16a83 Mon Sep 17 00:00:00 2001 From: tirth choksi Date: Thu, 16 May 2024 23:11:00 +0530 Subject: [PATCH 10/21] final --- pandas/tests/reshape/merge/test_merge.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 1b32cad632e10..1260975f1abce 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1495,14 +1495,7 @@ def test_different(self, dtype): def test_join_multi_dtypes(self, any_int_numpy_dtype, d2): dtype1 = np.dtype(any_int_numpy_dtype) dtype2 = np.dtype(d2) - - # New test implementation for np.uintc here - @pytest.mark.parametrize("d1", [np.uintc]) - @pytest.mark.parametrize("d2", [np.int64, np.float64, np.float32, np.float16]) - def test_join_multi_dtypes_with_uintc(self, d1, d2): - dtype1 = np.dtype(d1) - dtype2 = np.dtype(d2) - + left = DataFrame( { "k1": np.array([0, 1, 2] * 8, dtype=dtype1), @@ -1529,6 +1522,9 @@ def test_join_multi_dtypes_with_uintc(self, d1, d2): result = left.join(right, on=["k1", "k2"], sort=True) expected.sort_values(["k1", "k2"], kind="mergesort", inplace=True) tm.assert_frame_equal(result, expected) + def test_join_multi_dtypes_with_uintc(self): + # Test case specifically for np.uintc as the dtype + self.test_join_multi_dtypes(np.uintc, np.float64) @pytest.mark.parametrize( "int_vals, float_vals, exp_vals", From 88a16186fecb4bbaee8b51dcaf40716dd218d101 Mon Sep 17 00:00:00 2001 From: tirth choksi Date: Thu, 16 May 2024 23:12:38 +0530 Subject: [PATCH 11/21] final --- pandas/tests/reshape/merge/test_merge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 1260975f1abce..090bc5ca9dc58 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1524,7 +1524,7 @@ def test_join_multi_dtypes(self, any_int_numpy_dtype, d2): tm.assert_frame_equal(result, expected) def test_join_multi_dtypes_with_uintc(self): # Test case specifically for np.uintc as the dtype - self.test_join_multi_dtypes(np.uintc, np.float64) + self.test_join_multi_dtypes(np.uintc, d2) @pytest.mark.parametrize( "int_vals, float_vals, exp_vals", From 90e0b93dcd79503a7a24ff9e3f492e9c9db6e0a2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 16 May 2024 17:47:46 +0000 Subject: [PATCH 12/21] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pandas/tests/reshape/merge/test_merge.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 090bc5ca9dc58..66d35f4581d79 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1495,7 +1495,7 @@ def test_different(self, dtype): def test_join_multi_dtypes(self, any_int_numpy_dtype, d2): dtype1 = np.dtype(any_int_numpy_dtype) dtype2 = np.dtype(d2) - + left = DataFrame( { "k1": np.array([0, 1, 2] * 8, dtype=dtype1), @@ -1524,7 +1524,7 @@ def test_join_multi_dtypes(self, any_int_numpy_dtype, d2): tm.assert_frame_equal(result, expected) def test_join_multi_dtypes_with_uintc(self): # Test case specifically for np.uintc as the dtype - self.test_join_multi_dtypes(np.uintc, d2) + self.test_join_multi_dtypes(np.uintc, d2) @pytest.mark.parametrize( "int_vals, float_vals, exp_vals", From 1b9e3d07621b38fda248cfd2e5413c0966c2f4b7 Mon Sep 17 00:00:00 2001 From: tirth choksi Date: Thu, 16 May 2024 23:19:57 +0530 Subject: [PATCH 13/21] indentation change --- pandas/tests/reshape/merge/test_merge.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 66d35f4581d79..5b72b2cfc998e 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1522,6 +1522,7 @@ def test_join_multi_dtypes(self, any_int_numpy_dtype, d2): result = left.join(right, on=["k1", "k2"], sort=True) expected.sort_values(["k1", "k2"], kind="mergesort", inplace=True) tm.assert_frame_equal(result, expected) + def test_join_multi_dtypes_with_uintc(self): # Test case specifically for np.uintc as the dtype self.test_join_multi_dtypes(np.uintc, d2) From 4da0b867a752010b05e9142c1e208d32d6db0ecc Mon Sep 17 00:00:00 2001 From: tirth choksi Date: Thu, 16 May 2024 23:31:40 +0530 Subject: [PATCH 14/21] indentation error solved --- pandas/tests/reshape/merge/test_merge.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 5b72b2cfc998e..7a3795ae132f8 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1522,10 +1522,10 @@ def test_join_multi_dtypes(self, any_int_numpy_dtype, d2): result = left.join(right, on=["k1", "k2"], sort=True) expected.sort_values(["k1", "k2"], kind="mergesort", inplace=True) tm.assert_frame_equal(result, expected) - + def test_join_multi_dtypes_with_uintc(self): - # Test case specifically for np.uintc as the dtype - self.test_join_multi_dtypes(np.uintc, d2) + # Test case specifically for np.uintc as the dtype + self.test_join_multi_dtypes(np.uintc, d2) @pytest.mark.parametrize( "int_vals, float_vals, exp_vals", From 95bca2cea1f76c744653a4e5a06f60dd4f221a18 Mon Sep 17 00:00:00 2001 From: tirth choksi Date: Thu, 16 May 2024 23:39:25 +0530 Subject: [PATCH 15/21] error solved --- pandas/tests/reshape/merge/test_merge.py | 34 +++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 7a3795ae132f8..cdb336a8793cf 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1523,9 +1523,37 @@ def test_join_multi_dtypes(self, any_int_numpy_dtype, d2): expected.sort_values(["k1", "k2"], kind="mergesort", inplace=True) tm.assert_frame_equal(result, expected) - def test_join_multi_dtypes_with_uintc(self): - # Test case specifically for np.uintc as the dtype - self.test_join_multi_dtypes(np.uintc, d2) + @pytest.mark.parametrize("d2", [np.int64, np.float64, np.float32, np.float16]) + def test_join_multi_dtypes_with_uintc(self, np.uintc, d2): + dtype1 = np.dtype(np.uintc) + dtype2 = np.dtype(d2) + + left = DataFrame( + { + "k1": np.array([0, 1, 2] * 8, dtype=dtype1), + "k2": ["foo", "bar"] * 12, + "v": np.array(np.arange(24), dtype=np.int64), + } + ) + + index = MultiIndex.from_tuples([(2, "bar"), (1, "foo")]) + right = DataFrame({"v2": np.array([5, 7], dtype=dtype2)}, index=index) + + result = left.join(right, on=["k1", "k2"]) + + expected = left.copy() + + if dtype2.kind == "i": + dtype2 = np.dtype("float64") + expected["v2"] = np.array(np.nan, dtype=dtype2) + expected.loc[(expected.k1 == 2) & (expected.k2 == "bar"), "v2"] = 5 + expected.loc[(expected.k1 == 1) & (expected.k2 == "foo"), "v2"] = 7 + + tm.assert_frame_equal(result, expected) + + result = left.join(right, on=["k1", "k2"], sort=True) + expected.sort_values(["k1", "k2"], kind="mergesort", inplace=True) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "int_vals, float_vals, exp_vals", From a37151e40d50f990c6ee657f683f41011fa634df Mon Sep 17 00:00:00 2001 From: Tirth Choksi <121241826+Tirthchoksi22@users.noreply.github.com> Date: Thu, 16 May 2024 23:45:53 +0530 Subject: [PATCH 16/21] Update pandas/core/reshape/merge.py Co-authored-by: William Ayd --- pandas/core/reshape/merge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index be921c9e9fd24..9edf38d66a790 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -130,7 +130,7 @@ if np.intc is not np.int32: _factorizers[np.intc] = libhashtable.Int64Factorizer -if np.uintc is not np.uint32: +if np.uintc(0).itemsize == 8: _factorizers[np.uintc] = libhashtable.UInt64Factorizer _known = (np.ndarray, ExtensionArray, Index, ABCSeries) From c5a3ccc144d286f94b40dd99009fae072075bebf Mon Sep 17 00:00:00 2001 From: tirth choksi Date: Fri, 17 May 2024 00:06:50 +0530 Subject: [PATCH 17/21] update --- pandas/tests/reshape/merge/test_merge.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index cdb336a8793cf..e9c96fc06b484 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1523,10 +1523,9 @@ def test_join_multi_dtypes(self, any_int_numpy_dtype, d2): expected.sort_values(["k1", "k2"], kind="mergesort", inplace=True) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("d2", [np.int64, np.float64, np.float32, np.float16]) - def test_join_multi_dtypes_with_uintc(self, np.uintc, d2): + def test_join_multi_dtypes_with_uintc(self, np.uintc, np.float64): dtype1 = np.dtype(np.uintc) - dtype2 = np.dtype(d2) + dtype2 = np.dtype(np.float64) left = DataFrame( { From 7438297ed13d5079a23830021d49276a5d2527e1 Mon Sep 17 00:00:00 2001 From: tirth choksi Date: Fri, 17 May 2024 00:14:45 +0530 Subject: [PATCH 18/21] update as said --- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/tests/reshape/merge/test_merge.py | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 315815a9be8e4..814a1324b0747 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -474,7 +474,7 @@ Groupby/resample/rolling Reshaping ^^^^^^^^^ - Bug in :meth:`DataFrame.join` inconsistently setting result index name (:issue:`55815`) -- Fixed issue in `pd.merge` (`#58713`) where merging DataFrames with `np.intc` or `np.uintc` data types caused unexpected behavior or errors. Comprehensive testing now ensures consistent behavior across diverse data type combinations, enhancing stability and robustness of data merging operations. +- Bug in `pd.merge` on Windows where merging DataFrames with `np.intc` or `np.uintc` data types caused unexpected behavior or errors (:issue:`58713`) - Sparse diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index e9c96fc06b484..2c62ec19ec834 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1523,9 +1523,11 @@ def test_join_multi_dtypes(self, any_int_numpy_dtype, d2): expected.sort_values(["k1", "k2"], kind="mergesort", inplace=True) tm.assert_frame_equal(result, expected) - def test_join_multi_dtypes_with_uintc(self, np.uintc, np.float64): - dtype1 = np.dtype(np.uintc) - dtype2 = np.dtype(np.float64) + def test_join_multi_dtypes_with_uintc(self, d1, d2): + d1 = np.uintc + d2 = np.float64 + dtype1 = np.dtype(d1) + dtype2 = np.dtype(d2) left = DataFrame( { From 9105c97ce93fe1ca4392021db3ad8e1ae9f8b207 Mon Sep 17 00:00:00 2001 From: tirth choksi Date: Fri, 17 May 2024 00:24:33 +0530 Subject: [PATCH 19/21] upadte --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 814a1324b0747..1dfca22bea6bb 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -474,7 +474,7 @@ Groupby/resample/rolling Reshaping ^^^^^^^^^ - Bug in :meth:`DataFrame.join` inconsistently setting result index name (:issue:`55815`) -- Bug in `pd.merge` on Windows where merging DataFrames with `np.intc` or `np.uintc` data types caused unexpected behavior or errors (:issue:`58713`) +- Bug in ``pd.merge`` on Windows where merging DataFrames with ``np.intc`` or ``np.uintc`` data types caused unexpected behavior or errors (:issue:`58713`) - Sparse From 8506f78f2bc2b5d47b99fbcad884336da725b2cb Mon Sep 17 00:00:00 2001 From: tirth choksi Date: Fri, 17 May 2024 00:53:48 +0530 Subject: [PATCH 20/21] update --- pandas/tests/reshape/merge/test_merge.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 2c62ec19ec834..0ac861a3dac79 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1523,9 +1523,12 @@ def test_join_multi_dtypes(self, any_int_numpy_dtype, d2): expected.sort_values(["k1", "k2"], kind="mergesort", inplace=True) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( + "d1", [np.int64, np.int32, np.intc, np.int16, np.int8, np.uint8, np.uintc] + ) + @pytest.mark.parametrize("d2", [np.int64, np.float64, np.float32, np.float16]) def test_join_multi_dtypes_with_uintc(self, d1, d2): - d1 = np.uintc - d2 = np.float64 + dtype1 = np.dtype(d1) dtype2 = np.dtype(d2) From 621c8d110a4898912290e891a7eaaa4a9d03140d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 16 May 2024 19:28:40 +0000 Subject: [PATCH 21/21] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pandas/tests/reshape/merge/test_merge.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 0ac861a3dac79..de8f087cab0cf 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1528,7 +1528,6 @@ def test_join_multi_dtypes(self, any_int_numpy_dtype, d2): ) @pytest.mark.parametrize("d2", [np.int64, np.float64, np.float32, np.float16]) def test_join_multi_dtypes_with_uintc(self, d1, d2): - dtype1 = np.dtype(d1) dtype2 = np.dtype(d2)