Skip to content

Commit 3945d5e

Browse files
authored
Backport PR pandas-dev#56442 on branch 2.2.x (BUG: merge not sorting for new string dtype) (pandas-dev#56799)
BUG: merge not sorting for new string dtype (pandas-dev#56442)

* BUG: merge not sorting for new string dtype
* Fixup
* Update test_multi.py
* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

(cherry picked from commit b7e2202)
1 parent 3c89432 commit 3945d5e

File tree

4 files changed

+94
-68
lines changed

4 files changed

+94
-68
lines changed

doc/source/whatsnew/v2.2.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -893,6 +893,7 @@ Reshaping
893893
- Bug in :func:`merge_asof` when using a :class:`Timedelta` tolerance on a :class:`ArrowDtype` column (:issue:`56486`)
894894
- Bug in :func:`merge` not raising when merging datetime columns with timedelta columns (:issue:`56455`)
895895
- Bug in :func:`merge` not raising when merging string columns with numeric columns (:issue:`56441`)
896+
- Bug in :func:`merge` not sorting for new string dtype (:issue:`56442`)
896897
- Bug in :func:`merge` returning columns in incorrect order when left and/or right is empty (:issue:`51929`)
897898
- Bug in :meth:`DataFrame.melt` where an exception was raised if ``var_name`` was not a string (:issue:`55948`)
898899
- Bug in :meth:`DataFrame.melt` where it would not preserve the datetime (:issue:`55254`)

pandas/core/reshape/merge.py

+15-3
Original file line numberDiff line numberDiff line change
@@ -2488,18 +2488,30 @@ def _factorize_keys(
24882488
.combine_chunks()
24892489
.dictionary_encode()
24902490
)
2491-
length = len(dc.dictionary)
24922491

24932492
llab, rlab, count = (
2494-
pc.fill_null(dc.indices[slice(len_lk)], length)
2493+
pc.fill_null(dc.indices[slice(len_lk)], -1)
24952494
.to_numpy()
24962495
.astype(np.intp, copy=False),
2497-
pc.fill_null(dc.indices[slice(len_lk, None)], length)
2496+
pc.fill_null(dc.indices[slice(len_lk, None)], -1)
24982497
.to_numpy()
24992498
.astype(np.intp, copy=False),
25002499
len(dc.dictionary),
25012500
)
2501+
2502+
if sort:
2503+
uniques = dc.dictionary.to_numpy(zero_copy_only=False)
2504+
llab, rlab = _sort_labels(uniques, llab, rlab)
2505+
25022506
if dc.null_count > 0:
2507+
lmask = llab == -1
2508+
lany = lmask.any()
2509+
rmask = rlab == -1
2510+
rany = rmask.any()
2511+
if lany:
2512+
np.putmask(llab, lmask, count)
2513+
if rany:
2514+
np.putmask(rlab, rmask, count)
25032515
count += 1
25042516
return llab, rlab, count
25052517

pandas/tests/reshape/merge/test_join.py

+25-18
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
bdate_range,
1717
concat,
1818
merge,
19+
option_context,
1920
)
2021
import pandas._testing as tm
2122

@@ -563,24 +564,30 @@ def test_join_many_non_unique_index(self):
563564
tm.assert_frame_equal(inner, left)
564565
tm.assert_frame_equal(inner, right)
565566

566-
def test_join_sort(self):
567-
left = DataFrame({"key": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 4]})
568-
right = DataFrame({"value2": ["a", "b", "c"]}, index=["bar", "baz", "foo"])
569-
570-
joined = left.join(right, on="key", sort=True)
571-
expected = DataFrame(
572-
{
573-
"key": ["bar", "baz", "foo", "foo"],
574-
"value": [2, 3, 1, 4],
575-
"value2": ["a", "b", "c", "c"],
576-
},
577-
index=[1, 2, 0, 3],
578-
)
579-
tm.assert_frame_equal(joined, expected)
580-
581-
# smoke test
582-
joined = left.join(right, on="key", sort=False)
583-
tm.assert_index_equal(joined.index, Index(range(4)), exact=True)
567+
@pytest.mark.parametrize(
568+
"infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
569+
)
570+
def test_join_sort(self, infer_string):
571+
with option_context("future.infer_string", infer_string):
572+
left = DataFrame(
573+
{"key": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 4]}
574+
)
575+
right = DataFrame({"value2": ["a", "b", "c"]}, index=["bar", "baz", "foo"])
576+
577+
joined = left.join(right, on="key", sort=True)
578+
expected = DataFrame(
579+
{
580+
"key": ["bar", "baz", "foo", "foo"],
581+
"value": [2, 3, 1, 4],
582+
"value2": ["a", "b", "c", "c"],
583+
},
584+
index=[1, 2, 0, 3],
585+
)
586+
tm.assert_frame_equal(joined, expected)
587+
588+
# smoke test
589+
joined = left.join(right, on="key", sort=False)
590+
tm.assert_index_equal(joined.index, Index(range(4)), exact=True)
584591

585592
def test_join_mixed_non_unique_index(self):
586593
# GH 12814, unorderable types in py3 with a non-unique index

pandas/tests/reshape/merge/test_multi.py

+53-47
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
import numpy as np
22
import pytest
33

4+
import pandas.util._test_decorators as td
5+
46
import pandas as pd
57
from pandas import (
68
DataFrame,
@@ -9,6 +11,7 @@
911
RangeIndex,
1012
Series,
1113
Timestamp,
14+
option_context,
1215
)
1316
import pandas._testing as tm
1417
from pandas.core.reshape.concat import concat
@@ -88,67 +91,70 @@ def test_merge_on_multikey(self, left, right, join_type):
8891

8992
tm.assert_frame_equal(result, expected)
9093

91-
@pytest.mark.parametrize("sort", [False, True])
92-
def test_left_join_multi_index(self, sort):
93-
icols = ["1st", "2nd", "3rd"]
94+
@pytest.mark.parametrize(
95+
"infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
96+
)
97+
def test_left_join_multi_index(self, sort, infer_string):
98+
with option_context("future.infer_string", infer_string):
99+
icols = ["1st", "2nd", "3rd"]
94100

95-
def bind_cols(df):
96-
iord = lambda a: 0 if a != a else ord(a)
97-
f = lambda ts: ts.map(iord) - ord("a")
98-
return f(df["1st"]) + f(df["3rd"]) * 1e2 + df["2nd"].fillna(0) * 10
101+
def bind_cols(df):
102+
iord = lambda a: 0 if a != a else ord(a)
103+
f = lambda ts: ts.map(iord) - ord("a")
104+
return f(df["1st"]) + f(df["3rd"]) * 1e2 + df["2nd"].fillna(0) * 10
99105

100-
def run_asserts(left, right, sort):
101-
res = left.join(right, on=icols, how="left", sort=sort)
106+
def run_asserts(left, right, sort):
107+
res = left.join(right, on=icols, how="left", sort=sort)
102108

103-
assert len(left) < len(res) + 1
104-
assert not res["4th"].isna().any()
105-
assert not res["5th"].isna().any()
109+
assert len(left) < len(res) + 1
110+
assert not res["4th"].isna().any()
111+
assert not res["5th"].isna().any()
106112

107-
tm.assert_series_equal(res["4th"], -res["5th"], check_names=False)
108-
result = bind_cols(res.iloc[:, :-2])
109-
tm.assert_series_equal(res["4th"], result, check_names=False)
110-
assert result.name is None
113+
tm.assert_series_equal(res["4th"], -res["5th"], check_names=False)
114+
result = bind_cols(res.iloc[:, :-2])
115+
tm.assert_series_equal(res["4th"], result, check_names=False)
116+
assert result.name is None
111117

112-
if sort:
113-
tm.assert_frame_equal(res, res.sort_values(icols, kind="mergesort"))
118+
if sort:
119+
tm.assert_frame_equal(res, res.sort_values(icols, kind="mergesort"))
114120

115-
out = merge(left, right.reset_index(), on=icols, sort=sort, how="left")
121+
out = merge(left, right.reset_index(), on=icols, sort=sort, how="left")
116122

117-
res.index = RangeIndex(len(res))
118-
tm.assert_frame_equal(out, res)
123+
res.index = RangeIndex(len(res))
124+
tm.assert_frame_equal(out, res)
119125

120-
lc = list(map(chr, np.arange(ord("a"), ord("z") + 1)))
121-
left = DataFrame(
122-
np.random.default_rng(2).choice(lc, (50, 2)), columns=["1st", "3rd"]
123-
)
124-
# Explicit cast to float to avoid implicit cast when setting nan
125-
left.insert(
126-
1,
127-
"2nd",
128-
np.random.default_rng(2).integers(0, 10, len(left)).astype("float"),
129-
)
126+
lc = list(map(chr, np.arange(ord("a"), ord("z") + 1)))
127+
left = DataFrame(
128+
np.random.default_rng(2).choice(lc, (50, 2)), columns=["1st", "3rd"]
129+
)
130+
# Explicit cast to float to avoid implicit cast when setting nan
131+
left.insert(
132+
1,
133+
"2nd",
134+
np.random.default_rng(2).integers(0, 10, len(left)).astype("float"),
135+
)
130136

131-
i = np.random.default_rng(2).permutation(len(left))
132-
right = left.iloc[i].copy()
137+
i = np.random.default_rng(2).permutation(len(left))
138+
right = left.iloc[i].copy()
133139

134-
left["4th"] = bind_cols(left)
135-
right["5th"] = -bind_cols(right)
136-
right.set_index(icols, inplace=True)
140+
left["4th"] = bind_cols(left)
141+
right["5th"] = -bind_cols(right)
142+
right.set_index(icols, inplace=True)
137143

138-
run_asserts(left, right, sort)
144+
run_asserts(left, right, sort)
139145

140-
# inject some nulls
141-
left.loc[1::4, "1st"] = np.nan
142-
left.loc[2::5, "2nd"] = np.nan
143-
left.loc[3::6, "3rd"] = np.nan
144-
left["4th"] = bind_cols(left)
146+
# inject some nulls
147+
left.loc[1::4, "1st"] = np.nan
148+
left.loc[2::5, "2nd"] = np.nan
149+
left.loc[3::6, "3rd"] = np.nan
150+
left["4th"] = bind_cols(left)
145151

146-
i = np.random.default_rng(2).permutation(len(left))
147-
right = left.iloc[i, :-1]
148-
right["5th"] = -bind_cols(right)
149-
right.set_index(icols, inplace=True)
152+
i = np.random.default_rng(2).permutation(len(left))
153+
right = left.iloc[i, :-1]
154+
right["5th"] = -bind_cols(right)
155+
right.set_index(icols, inplace=True)
150156

151-
run_asserts(left, right, sort)
157+
run_asserts(left, right, sort)
152158

153159
@pytest.mark.parametrize("sort", [False, True])
154160
def test_merge_right_vs_left(self, left, right, sort):

0 commit comments

Comments
 (0)