Skip to content

Commit 66a2111

Browse files
authored
BUG: Fix out-of-bounds access in safe_sort with an empty array and non-empty codes (pandas-dev#59489)
* Fix out-of-bounds violations in safe_sort for empty arrays. Previously we masked `codes` referring to out-of-bounds elements to 0 and then fixed them up afterwards to -1 using `np.putmask`. However, this results in out-of-bounds access in `take_nd` if the array is empty. Instead, set all out-of-bounds indices in `codes` to -1 immediately, as these can be handled by `take_nd`.
* Remove dead code. `use_na_sentinel` cannot be truthy inside an else branch where it is falsy.
* Add test based upon pandas-dev#59421
1 parent 3eb8d34 commit 66a2111

File tree

4 files changed

+21
-11
lines changed

4 files changed

+21
-11
lines changed

doc/source/whatsnew/v3.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -621,6 +621,7 @@ Reshaping
621621
^^^^^^^^^
622622
- Bug in :func:`qcut` where values at the quantile boundaries could be incorrectly assigned (:issue:`59355`)
623623
- Bug in :meth:`DataFrame.join` inconsistently setting result index name (:issue:`55815`)
624+
- Bug in :meth:`DataFrame.merge` where merging on a column containing only ``NaN`` values resulted in an out-of-bounds array access (:issue:`59421`)
624625
- Bug in :meth:`DataFrame.unstack` producing incorrect results when ``sort=False`` (:issue:`54987`, :issue:`55516`)
625626
- Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtensionDtype` (:issue:`59123`)
626627

pandas/core/algorithms.py

+1-11
Original file line numberDiff line numberDiff line change
@@ -1529,9 +1529,7 @@ def safe_sort(
15291529
order2 = sorter.argsort()
15301530
if verify:
15311531
mask = (codes < -len(values)) | (codes >= len(values))
1532-
codes[mask] = 0
1533-
else:
1534-
mask = None
1532+
codes[mask] = -1
15351533
new_codes = take_nd(order2, codes, fill_value=-1)
15361534
else:
15371535
reverse_indexer = np.empty(len(sorter), dtype=int)
@@ -1540,14 +1538,6 @@ def safe_sort(
15401538
# may deal with them here without performance loss using `mode='wrap'`
15411539
new_codes = reverse_indexer.take(codes, mode="wrap")
15421540

1543-
if use_na_sentinel:
1544-
mask = codes == -1
1545-
if verify:
1546-
mask = mask | (codes < -len(values)) | (codes >= len(values))
1547-
1548-
if use_na_sentinel and mask is not None:
1549-
np.putmask(new_codes, mask, -1)
1550-
15511541
return ordered, ensure_platform_int(new_codes)
15521542

15531543

pandas/tests/reshape/merge/test_merge.py

+12
Original file line numberDiff line numberDiff line change
@@ -2998,3 +2998,15 @@ def test_merge_datetime_and_timedelta(how):
29982998
)
29992999
with pytest.raises(ValueError, match=re.escape(msg)):
30003000
right.merge(left, on="key", how=how)
3001+
3002+
3003+
def test_merge_on_all_nan_column():
3004+
# GH#59421
3005+
left = DataFrame({"x": [1, 2, 3], "y": [np.nan, np.nan, np.nan], "z": [4, 5, 6]})
3006+
right = DataFrame({"x": [1, 2, 3], "y": [np.nan, np.nan, np.nan], "zz": [4, 5, 6]})
3007+
result = left.merge(right, on=["x", "y"], how="outer")
3008+
# Should not trigger array bounds error with bounds checking or asan enabled.
3009+
expected = DataFrame(
3010+
{"x": [1, 2, 3], "y": [np.nan, np.nan, np.nan], "z": [4, 5, 6], "zz": [4, 5, 6]}
3011+
)
3012+
tm.assert_frame_equal(result, expected)

pandas/tests/test_sorting.py

+7
Original file line numberDiff line numberDiff line change
@@ -408,6 +408,13 @@ def test_codes_out_of_bound(self):
408408
tm.assert_numpy_array_equal(result, expected)
409409
tm.assert_numpy_array_equal(result_codes, expected_codes)
410410

411+
@pytest.mark.parametrize("codes", [[-1, -1], [2, -1], [2, 2]])
412+
def test_codes_empty_array_out_of_bound(self, codes):
413+
empty_values = np.array([])
414+
expected_codes = -np.ones_like(codes, dtype=np.intp)
415+
_, result_codes = safe_sort(empty_values, codes)
416+
tm.assert_numpy_array_equal(result_codes, expected_codes)
417+
411418
def test_mixed_integer(self):
412419
values = np.array(["b", 1, 0, "a", 0, "b"], dtype=object)
413420
result = safe_sort(values)

0 commit comments

Comments
 (0)