Skip to content

Commit b7e2202

Browse files
BUG: merge not sorting for new string dtype (#56442)
* BUG: merge not sorting for new string dtype
* Fixup
* Update test_multi.py
* [pre-commit.ci] auto fixes from pre-commit.com hooks

  for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent b293fac commit b7e2202

File tree

4 files changed

+93
-66
lines changed

4 files changed

+93
-66
lines changed

doc/source/whatsnew/v2.2.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -893,6 +893,7 @@ Reshaping
893893
- Bug in :func:`merge_asof` when using a :class:`Timedelta` tolerance on a :class:`ArrowDtype` column (:issue:`56486`)
894894
- Bug in :func:`merge` not raising when merging datetime columns with timedelta columns (:issue:`56455`)
895895
- Bug in :func:`merge` not raising when merging string columns with numeric columns (:issue:`56441`)
896+
- Bug in :func:`merge` not sorting for new string dtype (:issue:`56442`)
896897
- Bug in :func:`merge` returning columns in incorrect order when left and/or right is empty (:issue:`51929`)
897898
- Bug in :meth:`DataFrame.melt` where an exception was raised if ``var_name`` was not a string (:issue:`55948`)
898899
- Bug in :meth:`DataFrame.melt` where it would not preserve the datetime (:issue:`55254`)

pandas/core/reshape/merge.py

+15-3
Original file line number | Diff line number | Diff line change
@@ -2488,18 +2488,30 @@ def _factorize_keys(
24882488
.combine_chunks()
24892489
.dictionary_encode()
24902490
)
2491-
length = len(dc.dictionary)
24922491

24932492
llab, rlab, count = (
2494-
pc.fill_null(dc.indices[slice(len_lk)], length)
2493+
pc.fill_null(dc.indices[slice(len_lk)], -1)
24952494
.to_numpy()
24962495
.astype(np.intp, copy=False),
2497-
pc.fill_null(dc.indices[slice(len_lk, None)], length)
2496+
pc.fill_null(dc.indices[slice(len_lk, None)], -1)
24982497
.to_numpy()
24992498
.astype(np.intp, copy=False),
25002499
len(dc.dictionary),
25012500
)
2501+
2502+
if sort:
2503+
uniques = dc.dictionary.to_numpy(zero_copy_only=False)
2504+
llab, rlab = _sort_labels(uniques, llab, rlab)
2505+
25022506
if dc.null_count > 0:
2507+
lmask = llab == -1
2508+
lany = lmask.any()
2509+
rmask = rlab == -1
2510+
rany = rmask.any()
2511+
if lany:
2512+
np.putmask(llab, lmask, count)
2513+
if rany:
2514+
np.putmask(rlab, rmask, count)
25032515
count += 1
25042516
return llab, rlab, count
25052517

pandas/tests/reshape/merge/test_join.py

+25-18
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,7 @@
1616
bdate_range,
1717
concat,
1818
merge,
19+
option_context,
1920
)
2021
import pandas._testing as tm
2122

@@ -562,24 +563,30 @@ def test_join_many_non_unique_index(self):
562563
tm.assert_frame_equal(inner, left)
563564
tm.assert_frame_equal(inner, right)
564565

565-
def test_join_sort(self):
566-
left = DataFrame({"key": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 4]})
567-
right = DataFrame({"value2": ["a", "b", "c"]}, index=["bar", "baz", "foo"])
568-
569-
joined = left.join(right, on="key", sort=True)
570-
expected = DataFrame(
571-
{
572-
"key": ["bar", "baz", "foo", "foo"],
573-
"value": [2, 3, 1, 4],
574-
"value2": ["a", "b", "c", "c"],
575-
},
576-
index=[1, 2, 0, 3],
577-
)
578-
tm.assert_frame_equal(joined, expected)
579-
580-
# smoke test
581-
joined = left.join(right, on="key", sort=False)
582-
tm.assert_index_equal(joined.index, Index(range(4)), exact=True)
566+
@pytest.mark.parametrize(
567+
"infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
568+
)
569+
def test_join_sort(self, infer_string):
570+
with option_context("future.infer_string", infer_string):
571+
left = DataFrame(
572+
{"key": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 4]}
573+
)
574+
right = DataFrame({"value2": ["a", "b", "c"]}, index=["bar", "baz", "foo"])
575+
576+
joined = left.join(right, on="key", sort=True)
577+
expected = DataFrame(
578+
{
579+
"key": ["bar", "baz", "foo", "foo"],
580+
"value": [2, 3, 1, 4],
581+
"value2": ["a", "b", "c", "c"],
582+
},
583+
index=[1, 2, 0, 3],
584+
)
585+
tm.assert_frame_equal(joined, expected)
586+
587+
# smoke test
588+
joined = left.join(right, on="key", sort=False)
589+
tm.assert_index_equal(joined.index, Index(range(4)), exact=True)
583590

584591
def test_join_mixed_non_unique_index(self):
585592
# GH 12814, unorderable types in py3 with a non-unique index

pandas/tests/reshape/merge/test_multi.py

+52-45
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,8 @@
11
import numpy as np
22
import pytest
33

4+
import pandas.util._test_decorators as td
5+
46
import pandas as pd
57
from pandas import (
68
DataFrame,
@@ -9,6 +11,7 @@
911
RangeIndex,
1012
Series,
1113
Timestamp,
14+
option_context,
1215
)
1316
import pandas._testing as tm
1417
from pandas.core.reshape.concat import concat
@@ -88,64 +91,68 @@ def test_merge_on_multikey(self, left, right, join_type):
8891

8992
tm.assert_frame_equal(result, expected)
9093

91-
def test_left_join_multi_index(self, sort):
92-
icols = ["1st", "2nd", "3rd"]
94+
@pytest.mark.parametrize(
95+
"infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
96+
)
97+
def test_left_join_multi_index(self, sort, infer_string):
98+
with option_context("future.infer_string", infer_string):
99+
icols = ["1st", "2nd", "3rd"]
93100

94-
def bind_cols(df):
95-
iord = lambda a: 0 if a != a else ord(a)
96-
f = lambda ts: ts.map(iord) - ord("a")
97-
return f(df["1st"]) + f(df["3rd"]) * 1e2 + df["2nd"].fillna(0) * 10
101+
def bind_cols(df):
102+
iord = lambda a: 0 if a != a else ord(a)
103+
f = lambda ts: ts.map(iord) - ord("a")
104+
return f(df["1st"]) + f(df["3rd"]) * 1e2 + df["2nd"].fillna(0) * 10
98105

99-
def run_asserts(left, right, sort):
100-
res = left.join(right, on=icols, how="left", sort=sort)
106+
def run_asserts(left, right, sort):
107+
res = left.join(right, on=icols, how="left", sort=sort)
101108

102-
assert len(left) < len(res) + 1
103-
assert not res["4th"].isna().any()
104-
assert not res["5th"].isna().any()
109+
assert len(left) < len(res) + 1
110+
assert not res["4th"].isna().any()
111+
assert not res["5th"].isna().any()
105112

106-
tm.assert_series_equal(res["4th"], -res["5th"], check_names=False)
107-
result = bind_cols(res.iloc[:, :-2])
108-
tm.assert_series_equal(res["4th"], result, check_names=False)
109-
assert result.name is None
113+
tm.assert_series_equal(res["4th"], -res["5th"], check_names=False)
114+
result = bind_cols(res.iloc[:, :-2])
115+
tm.assert_series_equal(res["4th"], result, check_names=False)
116+
assert result.name is None
110117

111-
if sort:
112-
tm.assert_frame_equal(res, res.sort_values(icols, kind="mergesort"))
118+
if sort:
119+
tm.assert_frame_equal(res, res.sort_values(icols, kind="mergesort"))
113120

114-
out = merge(left, right.reset_index(), on=icols, sort=sort, how="left")
121+
out = merge(left, right.reset_index(), on=icols, sort=sort, how="left")
115122

116-
res.index = RangeIndex(len(res))
117-
tm.assert_frame_equal(out, res)
123+
res.index = RangeIndex(len(res))
124+
tm.assert_frame_equal(out, res)
118125

119-
lc = list(map(chr, np.arange(ord("a"), ord("z") + 1)))
120-
left = DataFrame(
121-
np.random.default_rng(2).choice(lc, (50, 2)), columns=["1st", "3rd"]
122-
)
123-
# Explicit cast to float to avoid implicit cast when setting nan
124-
left.insert(
125-
1,
126-
"2nd",
127-
np.random.default_rng(2).integers(0, 10, len(left)).astype("float"),
128-
)
129-
right = left.sample(frac=1, random_state=np.random.default_rng(2))
126+
lc = list(map(chr, np.arange(ord("a"), ord("z") + 1)))
127+
left = DataFrame(
128+
np.random.default_rng(2).choice(lc, (50, 2)), columns=["1st", "3rd"]
129+
)
130+
# Explicit cast to float to avoid implicit cast when setting nan
131+
left.insert(
132+
1,
133+
"2nd",
134+
np.random.default_rng(2).integers(0, 10, len(left)).astype("float"),
135+
)
136+
right = left.sample(frac=1, random_state=np.random.default_rng(2))
130137

131-
left["4th"] = bind_cols(left)
132-
right["5th"] = -bind_cols(right)
133-
right.set_index(icols, inplace=True)
138+
left["4th"] = bind_cols(left)
139+
right["5th"] = -bind_cols(right)
140+
right.set_index(icols, inplace=True)
134141

135-
run_asserts(left, right, sort)
142+
run_asserts(left, right, sort)
136143

137-
# inject some nulls
138-
left.loc[1::4, "1st"] = np.nan
139-
left.loc[2::5, "2nd"] = np.nan
140-
left.loc[3::6, "3rd"] = np.nan
141-
left["4th"] = bind_cols(left)
144+
# inject some nulls
145+
left.loc[1::4, "1st"] = np.nan
146+
left.loc[2::5, "2nd"] = np.nan
147+
left.loc[3::6, "3rd"] = np.nan
148+
left["4th"] = bind_cols(left)
142149

143-
i = np.random.default_rng(2).permutation(len(left))
144-
right = left.iloc[i, :-1]
145-
right["5th"] = -bind_cols(right)
146-
right.set_index(icols, inplace=True)
150+
i = np.random.default_rng(2).permutation(len(left))
151+
right = left.iloc[i, :-1]
152+
right["5th"] = -bind_cols(right)
153+
right.set_index(icols, inplace=True)
147154

148-
run_asserts(left, right, sort)
155+
run_asserts(left, right, sort)
149156

150157
def test_merge_right_vs_left(self, left, right, sort):
151158
# compare left vs right merge with multikey

0 commit comments

Comments
 (0)