From 317f9bad18eca722dec5290403d54a459f8b20e8 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 9 Jan 2024 01:05:57 +0100 Subject: [PATCH] BUG: merge not sorting for new string dtype (#56442) * BUG: merge not sorting for new string dtype * Fixup * Update test_multi.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> (cherry picked from commit b7e2202459eadc9dd599cbe58251ecc930798b97) --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/reshape/merge.py | 18 +++- pandas/tests/reshape/merge/test_join.py | 43 ++++++---- pandas/tests/reshape/merge/test_multi.py | 100 ++++++++++++----------- 4 files changed, 94 insertions(+), 68 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 36e677fa2a7a9..6a232365fbfeb 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -893,6 +893,7 @@ Reshaping - Bug in :func:`merge_asof` when using a :class:`Timedelta` tolerance on a :class:`ArrowDtype` column (:issue:`56486`) - Bug in :func:`merge` not raising when merging datetime columns with timedelta columns (:issue:`56455`) - Bug in :func:`merge` not raising when merging string columns with numeric columns (:issue:`56441`) +- Bug in :func:`merge` not sorting for new string dtype (:issue:`56442`) - Bug in :func:`merge` returning columns in incorrect order when left and/or right is empty (:issue:`51929`) - Bug in :meth:`DataFrame.melt` where an exception was raised if ``var_name`` was not a string (:issue:`55948`) - Bug in :meth:`DataFrame.melt` where it would not preserve the datetime (:issue:`55254`) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 320e4e33a29fb..410301b7697f2 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2488,18 +2488,30 @@ def _factorize_keys( .combine_chunks() .dictionary_encode() ) - length = len(dc.dictionary) llab, rlab, count = ( - pc.fill_null(dc.indices[slice(len_lk)], length) + pc.fill_null(dc.indices[slice(len_lk)], -1) .to_numpy() .astype(np.intp, copy=False), - pc.fill_null(dc.indices[slice(len_lk, None)], length) + pc.fill_null(dc.indices[slice(len_lk, None)], -1) .to_numpy() .astype(np.intp, copy=False), len(dc.dictionary), ) + + if sort: + uniques = dc.dictionary.to_numpy(zero_copy_only=False) + llab, rlab = _sort_labels(uniques, llab, rlab) + if dc.null_count > 0: + lmask = llab == -1 + lany = lmask.any() + rmask = rlab == -1 + rany = rmask.any() + if lany: + np.putmask(llab, lmask, count) + if rany: + np.putmask(rlab, rmask, count) count += 1 return llab, rlab, count diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 5a1f47e341222..1d5ed2d7373ce 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -16,6 +16,7 @@ bdate_range, concat, merge, + option_context, ) import pandas._testing as tm @@ -563,24 +564,30 @@ def test_join_many_non_unique_index(self): tm.assert_frame_equal(inner, left) tm.assert_frame_equal(inner, right) - def test_join_sort(self): - left = DataFrame({"key": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 4]}) - right = DataFrame({"value2": ["a", "b", "c"]}, index=["bar", "baz", "foo"]) - - joined = left.join(right, on="key", sort=True) - expected = DataFrame( - { - "key": ["bar", "baz", "foo", "foo"], - "value": [2, 3, 1, 4], - "value2": ["a", "b", "c", "c"], - }, - index=[1, 2, 0, 3], - ) - tm.assert_frame_equal(joined, expected) - - # smoke test - joined = left.join(right, on="key", sort=False) - tm.assert_index_equal(joined.index, Index(range(4)), exact=True) + @pytest.mark.parametrize( + "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))] + ) + def test_join_sort(self, infer_string): + with option_context("future.infer_string", infer_string): + left = DataFrame( + {"key": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 4]} + ) + right = DataFrame({"value2": ["a", "b", "c"]}, index=["bar", "baz", "foo"]) + + joined = left.join(right, on="key", sort=True) + expected = DataFrame( + { + "key": ["bar", "baz", "foo", "foo"], + "value": [2, 3, 1, 4], + "value2": ["a", "b", "c", "c"], + }, + index=[1, 2, 0, 3], + ) + tm.assert_frame_equal(joined, expected) + + # smoke test + joined = left.join(right, on="key", sort=False) + tm.assert_index_equal(joined.index, Index(range(4)), exact=True) def test_join_mixed_non_unique_index(self): # GH 12814, unorderable types in py3 with a non-unique index diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py index 269d3a2b7078e..5973f13c9d495 100644 --- a/pandas/tests/reshape/merge/test_multi.py +++ b/pandas/tests/reshape/merge/test_multi.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( DataFrame, @@ -9,6 +11,7 @@ RangeIndex, Series, Timestamp, + option_context, ) import pandas._testing as tm from pandas.core.reshape.concat import concat @@ -88,67 +91,70 @@ def test_merge_on_multikey(self, left, right, join_type): tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("sort", [False, True]) - def test_left_join_multi_index(self, sort): - icols = ["1st", "2nd", "3rd"] + @pytest.mark.parametrize( + "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))] + ) + def test_left_join_multi_index(self, sort, infer_string): + with option_context("future.infer_string", infer_string): + icols = ["1st", "2nd", "3rd"] - def bind_cols(df): - iord = lambda a: 0 if a != a else ord(a) - f = lambda ts: ts.map(iord) - ord("a") - return f(df["1st"]) + f(df["3rd"]) * 1e2 + df["2nd"].fillna(0) * 10 + def bind_cols(df): + iord = lambda a: 0 if a != a else ord(a) + f = lambda ts: ts.map(iord) - ord("a") + return f(df["1st"]) + f(df["3rd"]) * 1e2 + df["2nd"].fillna(0) * 10 - def run_asserts(left, right, sort): - res = left.join(right, on=icols, how="left", sort=sort) + def run_asserts(left, right, sort): + res = left.join(right, on=icols, how="left", sort=sort) - assert len(left) < len(res) + 1 - assert not res["4th"].isna().any() - assert not res["5th"].isna().any() + assert len(left) < len(res) + 1 + assert not res["4th"].isna().any() + assert not res["5th"].isna().any() - tm.assert_series_equal(res["4th"], -res["5th"], check_names=False) - result = bind_cols(res.iloc[:, :-2]) - tm.assert_series_equal(res["4th"], result, check_names=False) - assert result.name is None + tm.assert_series_equal(res["4th"], -res["5th"], check_names=False) + result = bind_cols(res.iloc[:, :-2]) + tm.assert_series_equal(res["4th"], result, check_names=False) + assert result.name is None - if sort: - tm.assert_frame_equal(res, res.sort_values(icols, kind="mergesort")) + if sort: + tm.assert_frame_equal(res, res.sort_values(icols, kind="mergesort")) - out = merge(left, right.reset_index(), on=icols, sort=sort, how="left") + out = merge(left, right.reset_index(), on=icols, sort=sort, how="left") - res.index = RangeIndex(len(res)) - tm.assert_frame_equal(out, res) + res.index = RangeIndex(len(res)) + tm.assert_frame_equal(out, res) - lc = list(map(chr, np.arange(ord("a"), ord("z") + 1))) - left = DataFrame( - np.random.default_rng(2).choice(lc, (50, 2)), columns=["1st", "3rd"] - ) - # Explicit cast to float to avoid implicit cast when setting nan - left.insert( - 1, - "2nd", - np.random.default_rng(2).integers(0, 10, len(left)).astype("float"), - ) + lc = list(map(chr, np.arange(ord("a"), ord("z") + 1))) + left = DataFrame( + np.random.default_rng(2).choice(lc, (50, 2)), columns=["1st", "3rd"] + ) + # Explicit cast to float to avoid implicit cast when setting nan + left.insert( + 1, + "2nd", + np.random.default_rng(2).integers(0, 10, len(left)).astype("float"), + ) - i = np.random.default_rng(2).permutation(len(left)) - right = left.iloc[i].copy() + i = np.random.default_rng(2).permutation(len(left)) + right = left.iloc[i].copy() - left["4th"] = bind_cols(left) - right["5th"] = -bind_cols(right) - right.set_index(icols, inplace=True) + left["4th"] = bind_cols(left) + right["5th"] = -bind_cols(right) + right.set_index(icols, inplace=True) - run_asserts(left, right, sort) + run_asserts(left, right, sort) - # inject some nulls - left.loc[1::4, "1st"] = np.nan - left.loc[2::5, "2nd"] = np.nan - left.loc[3::6, "3rd"] = np.nan - left["4th"] = bind_cols(left) + # inject some nulls + left.loc[1::4, "1st"] = np.nan + left.loc[2::5, "2nd"] = np.nan + left.loc[3::6, "3rd"] = np.nan + left["4th"] = bind_cols(left) - i = np.random.default_rng(2).permutation(len(left)) - right = left.iloc[i, :-1] - right["5th"] = -bind_cols(right) - right.set_index(icols, inplace=True) + i = np.random.default_rng(2).permutation(len(left)) + right = left.iloc[i, :-1] + right["5th"] = -bind_cols(right) + right.set_index(icols, inplace=True) - run_asserts(left, right, sort) + run_asserts(left, right, sort) @pytest.mark.parametrize("sort", [False, True]) def test_merge_right_vs_left(self, left, right, sort):