From 07239bb9b27536e874d3ad1240e39d2e2ee24d49 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 11 Dec 2023 00:11:31 +0100 Subject: [PATCH 1/5] BUG: merge not raising for String and numeric merges --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/reshape/merge.py | 18 +++++++++++------- pandas/tests/reshape/merge/test_join.py | 4 ++-- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index ad44e87cacf82..3b71a61794756 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -664,6 +664,7 @@ Reshaping - Bug in :func:`concat` ignoring ``sort`` parameter when passed :class:`DatetimeIndex` indexes (:issue:`54769`) - Bug in :func:`concat` renaming :class:`Series` when ``ignore_index=False`` (:issue:`15047`) - Bug in :func:`merge_asof` raising ``TypeError`` when ``by`` dtype is not ``object``, ``int64``, or ``uint64`` (:issue:`22794`) +- Bug in :func:`merge` not raising when merging string columns with numeric columns (:issue:`56441`) - Bug in :func:`merge` returning columns in incorrect order when left and/or right is empty (:issue:`51929`) - Bug in :meth:`DataFrame.melt` where an exception was raised if ``var_name`` was not a string (:issue:`55948`) - Bug in :meth:`DataFrame.melt` where it would not preserve the datetime (:issue:`55254`) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index f07c4fb8f7d5f..cbf3ec70cf353 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1355,8 +1355,12 @@ def _maybe_coerce_merge_keys(self) -> None: lk_is_cat = isinstance(lk.dtype, CategoricalDtype) rk_is_cat = isinstance(rk.dtype, CategoricalDtype) - lk_is_object = is_object_dtype(lk.dtype) - rk_is_object = is_object_dtype(rk.dtype) + lk_is_object_or_string = is_object_dtype(lk.dtype) or is_string_dtype( + lk.dtype + ) + rk_is_object_or_string = is_object_dtype(rk.dtype) or is_string_dtype( + rk.dtype + ) # if either left or right is a categorical # then the must match exactly in categories & ordered @@ -1451,14 +1455,14 @@ def _maybe_coerce_merge_keys(self) -> None: # incompatible dtypes GH 9780, GH 15800 # bool values are coerced to object - elif (lk_is_object and is_bool_dtype(rk.dtype)) or ( - is_bool_dtype(lk.dtype) and rk_is_object + elif (lk_is_object_or_string and is_bool_dtype(rk.dtype)) or ( + is_bool_dtype(lk.dtype) and rk_is_object_or_string ): pass # object values are allowed to be merged - elif (lk_is_object and is_numeric_dtype(rk.dtype)) or ( - is_numeric_dtype(lk.dtype) and rk_is_object + elif (lk_is_object_or_string and is_numeric_dtype(rk.dtype)) or ( + is_numeric_dtype(lk.dtype) and rk_is_object_or_string ): inferred_left = lib.infer_dtype(lk, skipna=False) inferred_right = lib.infer_dtype(rk, skipna=False) @@ -1497,7 +1501,7 @@ def _maybe_coerce_merge_keys(self) -> None: # allows datetime with different resolutions continue - elif lk_is_object and rk_is_object: + elif is_object_dtype(lk.dtype) and is_object_dtype(rk.dtype): continue # Houston, we have a problem! diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 3408e6e4731bd..1203d8a923068 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -150,8 +150,8 @@ def test_join_on(self, target_source): # overlap source_copy = source.copy() msg = ( - "You are trying to merge on float64 and object columns for key 'A'. " - "If you wish to proceed you should use pd.concat" + "You are trying to merge on float64 and object|string columns for key " + "'A'. If you wish to proceed you should use pd.concat" ) with pytest.raises(ValueError, match=msg): target.join(source_copy, on="A") From 6c4930b89f450ef5c0e2c799f877a13d74acb32c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 11 Dec 2023 00:37:33 +0100 Subject: [PATCH 2/5] BUG: merge not sorting for new string dtype --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/reshape/merge.py | 4 +++ pandas/tests/reshape/merge/test_join.py | 45 +++++++++++++++---------- 3 files changed, 32 insertions(+), 18 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index ad44e87cacf82..f418d0b02ce4d 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -664,6 +664,7 @@ Reshaping - Bug in :func:`concat` ignoring ``sort`` parameter when passed :class:`DatetimeIndex` indexes (:issue:`54769`) - Bug in :func:`concat` renaming :class:`Series` when ``ignore_index=False`` (:issue:`15047`) - Bug in :func:`merge_asof` raising ``TypeError`` when ``by`` dtype is not ``object``, ``int64``, or ``uint64`` (:issue:`22794`) +- Bug in :func:`merge` not sorting for new string dtype (:issue:`56442`) - Bug in :func:`merge` returning columns in incorrect order when left and/or right is empty (:issue:`51929`) - Bug in :meth:`DataFrame.melt` where an exception was raised if ``var_name`` was not a string (:issue:`55948`) - Bug in :meth:`DataFrame.melt` where it would not preserve the datetime (:issue:`55254`) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index f07c4fb8f7d5f..3953305492505 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2410,6 +2410,10 @@ def _factorize_keys( ) if dc.null_count > 0: count += 1 + + if sort: + uniques = dc.dictionary.to_numpy(zero_copy_only=False) + llab, rlab = _sort_labels(uniques, llab, rlab) return llab, rlab, count if not isinstance(lk, BaseMaskedArray) and not ( diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 3408e6e4731bd..b3189c7a2d95e 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -3,6 +3,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( Categorical, @@ -14,6 +16,7 @@ bdate_range, concat, merge, + option_context, ) import pandas._testing as tm @@ -557,24 +560,30 @@ def test_join_many_non_unique_index(self): tm.assert_frame_equal(inner, left) tm.assert_frame_equal(inner, right) - def test_join_sort(self): - left = DataFrame({"key": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 4]}) - right = DataFrame({"value2": ["a", "b", "c"]}, index=["bar", "baz", "foo"]) - - joined = left.join(right, on="key", sort=True) - expected = DataFrame( - { - "key": ["bar", "baz", "foo", "foo"], - "value": [2, 3, 1, 4], - "value2": ["a", "b", "c", "c"], - }, - index=[1, 2, 0, 3], - ) - tm.assert_frame_equal(joined, expected) - - # smoke test - joined = left.join(right, on="key", sort=False) - tm.assert_index_equal(joined.index, Index(range(4)), exact=True) + @pytest.mark.parametrize( + "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))] + ) + def test_join_sort(self, infer_string): + with option_context("future.infer_string", infer_string): + left = DataFrame( + {"key": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 4]} + ) + right = DataFrame({"value2": ["a", "b", "c"]}, index=["bar", "baz", "foo"]) + + joined = left.join(right, on="key", sort=True) + expected = DataFrame( + { + "key": ["bar", "baz", "foo", "foo"], + "value": [2, 3, 1, 4], + "value2": ["a", "b", "c", "c"], + }, + index=[1, 2, 0, 3], + ) + tm.assert_frame_equal(joined, expected) + + # smoke test + joined = left.join(right, on="key", sort=False) + tm.assert_index_equal(joined.index, Index(range(4)), exact=True) def test_join_mixed_non_unique_index(self): # GH 12814, unorderable types in py3 with a non-unique index From 3e63fb99b2a2d28f1b246319e0b8fc3a6fca9095 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 11 Dec 2023 00:38:58 +0100 Subject: [PATCH 3/5] Add coverage --- pandas/tests/reshape/merge/test_join.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 1203d8a923068..580b0c4d7b7b8 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -3,6 +3,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( Categorical, @@ -118,7 +120,10 @@ def test_handle_overlap_arbitrary_key(self, df, df2): assert "key1.foo" in joined assert "key2.bar" in joined - def test_join_on(self, target_source): + @pytest.mark.parametrize( + "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))] + ) + def test_join_on(self, target_source, infer_string): target, source = target_source merged = target.join(source, on="C") From dff745ba3a1243a9f58f749b34863ca5222b9d43 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 11 Dec 2023 01:47:10 +0100 Subject: [PATCH 4/5] Fixup tests --- pandas/core/reshape/merge.py | 18 ++++++-- pandas/tests/reshape/merge/test_join.py | 4 +- pandas/tests/reshape/merge/test_merge.py | 46 +++++++++++-------- pandas/tests/reshape/merge/test_merge_asof.py | 15 ++++-- pandas/tests/reshape/merge/test_multi.py | 2 +- 5 files changed, 55 insertions(+), 30 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index d0f46a0acfe00..95cbd6148ba28 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2401,23 +2401,31 @@ def _factorize_keys( .combine_chunks() .dictionary_encode() ) - length = len(dc.dictionary) llab, rlab, count = ( - pc.fill_null(dc.indices[slice(len_lk)], length) + pc.fill_null(dc.indices[slice(len_lk)], -1) .to_numpy() .astype(np.intp, copy=False), - pc.fill_null(dc.indices[slice(len_lk, None)], length) + pc.fill_null(dc.indices[slice(len_lk, None)], -1) .to_numpy() .astype(np.intp, copy=False), len(dc.dictionary), ) - if dc.null_count > 0: - count += 1 if sort: uniques = dc.dictionary.to_numpy(zero_copy_only=False) llab, rlab = _sort_labels(uniques, llab, rlab) + + if dc.null_count > 0: + lmask = llab == -1 + lany = lmask.any() + rmask = rlab == -1 + rany = rmask.any() + if lany: + np.putmask(llab, lmask, count) + if rany: + np.putmask(rlab, rmask, count) + count += 1 return llab, rlab, count if not isinstance(lk, BaseMaskedArray) and not ( diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index c8e9091de2df9..4c0a19ea0e33e 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -627,7 +627,7 @@ def test_mixed_type_join_with_suffix(self): df.insert(5, "dt", "foo") grouped = df.groupby("id") - msg = re.escape("agg function failed [how->mean,dtype->object]") + msg = re.escape("agg function failed [how->mean,dtype->") with pytest.raises(TypeError, match=msg): grouped.mean() mn = grouped.mean(numeric_only=True) @@ -772,7 +772,7 @@ def test_join_on_tz_aware_datetimeindex(self): ) result = df1.join(df2.set_index("date"), on="date") expected = df1.copy() - expected["vals_2"] = Series([np.nan] * 2 + list("tuv"), dtype=object) + expected["vals_2"] = Series([np.nan] * 2 + list("tuv")) tm.assert_frame_equal(result, expected) def test_join_datetime_string(self): diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index d7a343ae9f152..685de3f45420b 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -8,7 +8,10 @@ import numpy as np import pytest -from pandas.core.dtypes.common import is_object_dtype +from pandas.core.dtypes.common import ( + is_object_dtype, + is_string_dtype, +) from pandas.core.dtypes.dtypes import CategoricalDtype import pandas as pd @@ -316,14 +319,15 @@ def test_merge_copy(self): merged["d"] = "peekaboo" assert (right["d"] == "bar").all() - def test_merge_nocopy(self, using_array_manager): + def test_merge_nocopy(self, using_infer_string): left = DataFrame({"a": 0, "b": 1}, index=range(10)) right = DataFrame({"c": "foo", "d": "bar"}, index=range(10)) merged = merge(left, right, left_index=True, right_index=True, copy=False) assert np.shares_memory(merged["a"]._values, left["a"]._values) - assert np.shares_memory(merged["d"]._values, right["d"]._values) + if not using_infer_string: + assert np.shares_memory(merged["d"]._values, right["d"]._values) def test_intelligently_handle_join_key(self): # #733, be a bit more 1337 about not returning unconsolidated DataFrame @@ -667,11 +671,13 @@ def test_merge_nan_right(self): "i1_": {0: 0, 1: np.nan}, "i3": {0: 0.0, 1: np.nan}, None: {0: 0, 1: 0}, - } + }, + columns=Index(["i1", "i2", "i1_", "i3", None], dtype=object), ) .set_index(None) .reset_index()[["i1", "i2", "i1_", "i3"]] ) + result.columns = result.columns.astype("object") tm.assert_frame_equal(result, expected, check_dtype=False) def test_merge_nan_right2(self): @@ -820,7 +826,7 @@ def test_overlapping_columns_error_message(self): # #2649, #10639 df2.columns = ["key1", "foo", "foo"] - msg = r"Data columns not unique: Index\(\['foo'\], dtype='object'\)" + msg = r"Data columns not unique: Index\(\['foo'\], dtype='object|string'\)" with pytest.raises(MergeError, match=msg): merge(df, df2) @@ -1498,7 +1504,7 @@ def test_different(self, right_vals): # We allow merging on object and categorical cols and cast # categorical cols to object result = merge(left, right, on="A") - assert is_object_dtype(result.A.dtype) + assert is_object_dtype(result.A.dtype) or is_string_dtype(result.A.dtype) @pytest.mark.parametrize( "d1", [np.int64, np.int32, np.intc, np.int16, np.int8, np.uint8] @@ -1637,7 +1643,7 @@ def test_merge_incompat_dtypes_are_ok(self, df1_vals, df2_vals): result = merge(df1, df2, on=["A"]) assert is_object_dtype(result.A.dtype) result = merge(df2, df1, on=["A"]) - assert is_object_dtype(result.A.dtype) + assert is_object_dtype(result.A.dtype) or is_string_dtype(result.A.dtype) @pytest.mark.parametrize( "df1_vals, df2_vals", @@ -1867,25 +1873,27 @@ def right(): class TestMergeCategorical: - def test_identical(self, left): + def test_identical(self, left, using_infer_string): # merging on the same, should preserve dtypes merged = merge(left, left, on="X") result = merged.dtypes.sort_index() + dtype = np.dtype("O") if not using_infer_string else "string" expected = Series( - [CategoricalDtype(categories=["foo", "bar"]), np.dtype("O"), np.dtype("O")], + [CategoricalDtype(categories=["foo", "bar"]), dtype, dtype], index=["X", "Y_x", "Y_y"], ) tm.assert_series_equal(result, expected) - def test_basic(self, left, right): + def test_basic(self, left, right, using_infer_string): # we have matching Categorical dtypes in X # so should preserve the merged column merged = merge(left, right, on="X") result = merged.dtypes.sort_index() + dtype = np.dtype("O") if not using_infer_string else "string" expected = Series( [ CategoricalDtype(categories=["foo", "bar"]), - np.dtype("O"), + dtype, np.dtype("int64"), ], index=["X", "Y", "Z"], @@ -1989,16 +1997,17 @@ def test_multiindex_merge_with_unordered_categoricalindex(self, ordered): ).set_index(["id", "p"]) tm.assert_frame_equal(result, expected) - def test_other_columns(self, left, right): + def test_other_columns(self, left, right, using_infer_string): # non-merge columns should preserve if possible right = right.assign(Z=right.Z.astype("category")) merged = merge(left, right, on="X") result = merged.dtypes.sort_index() + dtype = np.dtype("O") if not using_infer_string else "string" expected = Series( [ CategoricalDtype(categories=["foo", "bar"]), - np.dtype("O"), + dtype, CategoricalDtype(categories=[1, 2]), ], index=["X", "Y", "Z"], @@ -2017,7 +2026,9 @@ def test_other_columns(self, left, right): lambda x: x.astype(CategoricalDtype(ordered=True)), ], ) - def test_dtype_on_merged_different(self, change, join_type, left, right): + def test_dtype_on_merged_different( + self, change, join_type, left, right, using_infer_string + ): # our merging columns, X now has 2 different dtypes # so we must be object as a result @@ -2029,9 +2040,8 @@ def test_dtype_on_merged_different(self, change, join_type, left, right): merged = merge(left, right, on="X", how=join_type) result = merged.dtypes.sort_index() - expected = Series( - [np.dtype("O"), np.dtype("O"), np.dtype("int64")], index=["X", "Y", "Z"] - ) + dtype = np.dtype("O") if not using_infer_string else "string" + expected = Series([dtype, dtype, np.dtype("int64")], index=["X", "Y", "Z"]) tm.assert_series_equal(result, expected) def test_self_join_multiple_categories(self): @@ -2499,7 +2509,7 @@ def test_merge_multiindex_columns(): expected_index = MultiIndex.from_tuples(tuples, names=["outer", "inner"]) expected = DataFrame(columns=expected_index) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, check_dtype=False) def test_merge_datetime_upcast_dtype(): diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index 6d0a405430c9f..2b93aa89c03e2 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -3080,8 +3080,11 @@ def test_on_float_by_int(self): tm.assert_frame_equal(result, expected) - def test_merge_datatype_error_raises(self): - msg = r"Incompatible merge dtype, .*, both sides must have numeric dtype" + def test_merge_datatype_error_raises(self, using_infer_string): + if using_infer_string: + msg = "incompatible merge keys" + else: + msg = r"Incompatible merge dtype, .*, both sides must have numeric dtype" left = pd.DataFrame({"left_val": [1, 5, 10], "a": ["a", "b", "c"]}) right = pd.DataFrame({"right_val": [1, 2, 3, 6, 7], "a": [1, 2, 3, 6, 7]}) @@ -3133,7 +3136,7 @@ def test_merge_on_nans(self, func, side): else: merge_asof(df, df_null, on="a") - def test_by_nullable(self, any_numeric_ea_dtype): + def test_by_nullable(self, any_numeric_ea_dtype, using_infer_string): # Note: this test passes if instead of using pd.array we use # np.array([np.nan, 1]). Other than that, I (@jbrockmendel) # have NO IDEA what the expected behavior is. @@ -3175,6 +3178,8 @@ def test_by_nullable(self, any_numeric_ea_dtype): } ) expected["value_y"] = np.array([np.nan, np.nan, np.nan], dtype=object) + if using_infer_string: + expected["value_y"] = expected["value_y"].astype("string[pyarrow_numpy]") tm.assert_frame_equal(result, expected) def test_merge_by_col_tz_aware(self): @@ -3200,7 +3205,7 @@ def test_merge_by_col_tz_aware(self): ) tm.assert_frame_equal(result, expected) - def test_by_mixed_tz_aware(self): + def test_by_mixed_tz_aware(self, using_infer_string): # GH 26649 left = pd.DataFrame( { @@ -3224,6 +3229,8 @@ def test_by_mixed_tz_aware(self): columns=["by_col1", "by_col2", "on_col", "value_x"], ) expected["value_y"] = np.array([np.nan], dtype=object) + if using_infer_string: + expected["value_y"] = expected["value_y"].astype("string[pyarrow_numpy]") tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("dtype", ["float64", "int16", "m8[ns]", "M8[us]"]) diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py index 269d3a2b7078e..c5d63c739121d 100644 --- a/pandas/tests/reshape/merge/test_multi.py +++ b/pandas/tests/reshape/merge/test_multi.py @@ -632,7 +632,7 @@ def test_join_multi_levels_outer(self, portfolio, household, expected): axis=0, sort=True, ).reindex(columns=expected.columns) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, check_column_type=False) def test_join_multi_levels_invalid(self, portfolio, household): portfolio = portfolio.copy() From 50dc8dc06a4b87b053c6e08b10c68d5eae4b3e1d Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 11 Dec 2023 01:56:01 +0100 Subject: [PATCH 5/5] Update --- pandas/tests/reshape/merge/test_multi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py index c5d63c739121d..60c10f883530b 100644 --- a/pandas/tests/reshape/merge/test_multi.py +++ b/pandas/tests/reshape/merge/test_multi.py @@ -632,7 +632,7 @@ def test_join_multi_levels_outer(self, portfolio, household, expected): axis=0, sort=True, ).reindex(columns=expected.columns) - tm.assert_frame_equal(result, expected, check_column_type=False) + tm.assert_frame_equal(result, expected, check_index_type=False) def test_join_multi_levels_invalid(self, portfolio, household): portfolio = portfolio.copy()