BUG: merge not sorting for new string dtype (#56442)

phofl · pre-commit-ci[bot] · web-flow · commit b7e2202459ea · 2024-01-08T16:05:57.000-08:00
* BUG: merge not sorting for new string dtype
* Fixup
* Update test_multi.py
* [pre-commit.ci] auto fixes from pre-commit.com hooks

  for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
@@ -893,6 +893,7 @@ Reshaping
 - Bug in :func:`merge_asof` when using a :class:`Timedelta` tolerance on a :class:`ArrowDtype` column (:issue:`56486`)
 - Bug in :func:`merge` not raising when merging datetime columns with timedelta columns (:issue:`56455`)
 - Bug in :func:`merge` not raising when merging string columns with numeric columns (:issue:`56441`)
+- Bug in :func:`merge` not sorting the result when ``sort=True`` for the new string dtype (:issue:`56442`)
 - Bug in :func:`merge` returning columns in incorrect order when left and/or right is empty (:issue:`51929`)
 - Bug in :meth:`DataFrame.melt` where an exception was raised if ``var_name`` was not a string (:issue:`55948`)
 - Bug in :meth:`DataFrame.melt` where it would not preserve the datetime (:issue:`55254`)
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
@@ -2488,18 +2488,30 @@ def _factorize_keys(
                 .combine_chunks()
                 .dictionary_encode()
             )
-            length = len(dc.dictionary)
 
             llab, rlab, count = (
-                pc.fill_null(dc.indices[slice(len_lk)], length)
+                pc.fill_null(dc.indices[slice(len_lk)], -1)
                 .to_numpy()
                 .astype(np.intp, copy=False),
-                pc.fill_null(dc.indices[slice(len_lk, None)], length)
+                pc.fill_null(dc.indices[slice(len_lk, None)], -1)
                 .to_numpy()
                 .astype(np.intp, copy=False),
                 len(dc.dictionary),
             )
+
+            if sort:
+                uniques = dc.dictionary.to_numpy(zero_copy_only=False)
+                llab, rlab = _sort_labels(uniques, llab, rlab)
+
             if dc.null_count > 0:
+                lmask = llab == -1
+                lany = lmask.any()
+                rmask = rlab == -1
+                rany = rmask.any()
+                if lany:
+                    np.putmask(llab, lmask, count)
+                if rany:
+                    np.putmask(rlab, rmask, count)
                 count += 1
             return llab, rlab, count
 
diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py
@@ -16,6 +16,7 @@
     bdate_range,
     concat,
     merge,
+    option_context,
 )
 import pandas._testing as tm
 
@@ -562,24 +563,30 @@ def test_join_many_non_unique_index(self):
         tm.assert_frame_equal(inner, left)
         tm.assert_frame_equal(inner, right)
 
-    def test_join_sort(self):
-        left = DataFrame({"key": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 4]})
-        right = DataFrame({"value2": ["a", "b", "c"]}, index=["bar", "baz", "foo"])
-
-        joined = left.join(right, on="key", sort=True)
-        expected = DataFrame(
-            {
-                "key": ["bar", "baz", "foo", "foo"],
-                "value": [2, 3, 1, 4],
-                "value2": ["a", "b", "c", "c"],
-            },
-            index=[1, 2, 0, 3],
-        )
-        tm.assert_frame_equal(joined, expected)
-
-        # smoke test
-        joined = left.join(right, on="key", sort=False)
-        tm.assert_index_equal(joined.index, Index(range(4)), exact=True)
+    @pytest.mark.parametrize(
+        "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
+    )
+    def test_join_sort(self, infer_string):
+        with option_context("future.infer_string", infer_string):
+            left = DataFrame(
+                {"key": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 4]}
+            )
+            right = DataFrame({"value2": ["a", "b", "c"]}, index=["bar", "baz", "foo"])
+
+            joined = left.join(right, on="key", sort=True)
+            expected = DataFrame(
+                {
+                    "key": ["bar", "baz", "foo", "foo"],
+                    "value": [2, 3, 1, 4],
+                    "value2": ["a", "b", "c", "c"],
+                },
+                index=[1, 2, 0, 3],
+            )
+            tm.assert_frame_equal(joined, expected)
+
+            # smoke test
+            joined = left.join(right, on="key", sort=False)
+            tm.assert_index_equal(joined.index, Index(range(4)), exact=True)
 
     def test_join_mixed_non_unique_index(self):
         # GH 12814, unorderable types in py3 with a non-unique index
diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py
@@ -1,6 +1,8 @@
 import numpy as np
 import pytest
 
+import pandas.util._test_decorators as td
+
 import pandas as pd
 from pandas import (
     DataFrame,
@@ -9,6 +11,7 @@
     RangeIndex,
     Series,
     Timestamp,
+    option_context,
 )
 import pandas._testing as tm
 from pandas.core.reshape.concat import concat
@@ -88,64 +91,68 @@ def test_merge_on_multikey(self, left, right, join_type):
 
         tm.assert_frame_equal(result, expected)
 
-    def test_left_join_multi_index(self, sort):
-        icols = ["1st", "2nd", "3rd"]
+    @pytest.mark.parametrize(
+        "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
+    )
+    def test_left_join_multi_index(self, sort, infer_string):
+        with option_context("future.infer_string", infer_string):
+            icols = ["1st", "2nd", "3rd"]
 
-        def bind_cols(df):
-            iord = lambda a: 0 if a != a else ord(a)
-            f = lambda ts: ts.map(iord) - ord("a")
-            return f(df["1st"]) + f(df["3rd"]) * 1e2 + df["2nd"].fillna(0) * 10
+            def bind_cols(df):
+                iord = lambda a: 0 if a != a else ord(a)
+                f = lambda ts: ts.map(iord) - ord("a")
+                return f(df["1st"]) + f(df["3rd"]) * 1e2 + df["2nd"].fillna(0) * 10
 
-        def run_asserts(left, right, sort):
-            res = left.join(right, on=icols, how="left", sort=sort)
+            def run_asserts(left, right, sort):
+                res = left.join(right, on=icols, how="left", sort=sort)
 
-            assert len(left) < len(res) + 1
-            assert not res["4th"].isna().any()
-            assert not res["5th"].isna().any()
+                assert len(left) < len(res) + 1
+                assert not res["4th"].isna().any()
+                assert not res["5th"].isna().any()
 
-            tm.assert_series_equal(res["4th"], -res["5th"], check_names=False)
-            result = bind_cols(res.iloc[:, :-2])
-            tm.assert_series_equal(res["4th"], result, check_names=False)
-            assert result.name is None
+                tm.assert_series_equal(res["4th"], -res["5th"], check_names=False)
+                result = bind_cols(res.iloc[:, :-2])
+                tm.assert_series_equal(res["4th"], result, check_names=False)
+                assert result.name is None
 
-            if sort:
-                tm.assert_frame_equal(res, res.sort_values(icols, kind="mergesort"))
+                if sort:
+                    tm.assert_frame_equal(res, res.sort_values(icols, kind="mergesort"))
 
-            out = merge(left, right.reset_index(), on=icols, sort=sort, how="left")
+                out = merge(left, right.reset_index(), on=icols, sort=sort, how="left")
 
-            res.index = RangeIndex(len(res))
-            tm.assert_frame_equal(out, res)
+                res.index = RangeIndex(len(res))
+                tm.assert_frame_equal(out, res)
 
-        lc = list(map(chr, np.arange(ord("a"), ord("z") + 1)))
-        left = DataFrame(
-            np.random.default_rng(2).choice(lc, (50, 2)), columns=["1st", "3rd"]
-        )
-        # Explicit cast to float to avoid implicit cast when setting nan
-        left.insert(
-            1,
-            "2nd",
-            np.random.default_rng(2).integers(0, 10, len(left)).astype("float"),
-        )
-        right = left.sample(frac=1, random_state=np.random.default_rng(2))
+            lc = list(map(chr, np.arange(ord("a"), ord("z") + 1)))
+            left = DataFrame(
+                np.random.default_rng(2).choice(lc, (50, 2)), columns=["1st", "3rd"]
+            )
+            # Explicit cast to float to avoid implicit cast when setting nan
+            left.insert(
+                1,
+                "2nd",
+                np.random.default_rng(2).integers(0, 10, len(left)).astype("float"),
+            )
+            right = left.sample(frac=1, random_state=np.random.default_rng(2))
 
-        left["4th"] = bind_cols(left)
-        right["5th"] = -bind_cols(right)
-        right.set_index(icols, inplace=True)
+            left["4th"] = bind_cols(left)
+            right["5th"] = -bind_cols(right)
+            right.set_index(icols, inplace=True)
 
-        run_asserts(left, right, sort)
+            run_asserts(left, right, sort)
 
-        # inject some nulls
-        left.loc[1::4, "1st"] = np.nan
-        left.loc[2::5, "2nd"] = np.nan
-        left.loc[3::6, "3rd"] = np.nan
-        left["4th"] = bind_cols(left)
+            # inject some nulls
+            left.loc[1::4, "1st"] = np.nan
+            left.loc[2::5, "2nd"] = np.nan
+            left.loc[3::6, "3rd"] = np.nan
+            left["4th"] = bind_cols(left)
 
-        i = np.random.default_rng(2).permutation(len(left))
-        right = left.iloc[i, :-1]
-        right["5th"] = -bind_cols(right)
-        right.set_index(icols, inplace=True)
+            i = np.random.default_rng(2).permutation(len(left))
+            right = left.iloc[i, :-1]
+            right["5th"] = -bind_cols(right)
+            right.set_index(icols, inplace=True)
 
-        run_asserts(left, right, sort)
+            run_asserts(left, right, sort)
 
     def test_merge_right_vs_left(self, left, right, sort):
         # compare left vs right merge with multikey