Skip to content

Commit 1320ff1

Browse files
committed
Fixed #36562
* Use a special sorting comparator for tuple arrays, which can be created when combine_first is called on DataFrames whose MultiIndex contains NaN and string values
1 parent 6db851b commit 1320ff1

File tree

4 files changed

+76
-5
lines changed

4 files changed

+76
-5
lines changed

doc/source/whatsnew/v1.2.0.rst

+2-1
Original file line numberDiff line numberDiff line change
@@ -363,7 +363,8 @@ MultiIndex
363363
^^^^^^^^^^
364364

365365
- Bug in :meth:`DataFrame.xs` when used with :class:`IndexSlice` raises ``TypeError`` with message ``"Expected label or tuple of labels"`` (:issue:`35301`)
366-
-
366+
- Bug in :meth:`DataFrame.combine_first` when used with :class:`MultiIndex` containing string and ``NaN`` values raises ``TypeError`` with message ``"'<' not supported between instances of 'float' and 'str'"`` (:issue:`36562`)
367+
367368

368369
I/O
369370
^^^

pandas/core/algorithms.py

+44-4
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from __future__ import annotations
66

77
import operator
8+
import functools
89
from textwrap import dedent
910
from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union, cast
1011
from warnings import catch_warnings, simplefilter, warn
@@ -2055,13 +2056,52 @@ def sort_mixed(values):
20552056
strs = np.sort(values[str_pos])
20562057
return np.concatenate([nums, np.asarray(strs, dtype=object)])
20572058

2059+
def sort_tuples(values):
    """
    Sort a 1-D object array of tuples whose elements may be missing values.

    Plain tuple comparison fails when a position holds NaN in one tuple and
    a string in another (GH36562), so the ordering is defined element by
    element instead:

    * positions are compared left to right and the first difference decides;
    * a missing value (as reported by ``isna``) sorts after any non-missing
      value, and two missing values in the same position count as equal;
    * when one tuple is a prefix of the other, the shorter tuple sorts first.

    Parameters
    ----------
    values : np.ndarray
        1-D object-dtype array of tuples.

    Returns
    -------
    np.ndarray
        The elements of ``values`` in sorted order. Tuples that compare
        equal keep their original relative order (``sorted`` is stable).

    Raises
    ------
    TypeError
        If two non-missing elements in the same position cannot be ordered
        with ``<`` (e.g. ``int`` versus ``str``).
    """
    # Missingness is a property of each element, so evaluate it once up
    # front instead of on every pairwise comparison made by the sort.
    na_masks = [tuple(isna(elem) for elem in tup) for tup in values]

    def cmp_func(index_x, index_y):
        x = values[index_x]
        y = values[index_y]
        pairs = zip(x, y, na_masks[index_x], na_masks[index_y])
        for x_i, y_i, x_i_na, y_i_na in pairs:
            # Settle missing values before touching ``==``: comparing pd.NA
            # with anything yields pd.NA, which has no truth value.
            if x_i_na or y_i_na:
                if x_i_na and y_i_na:
                    # both missing -> resolve tie with next element
                    continue
                # sort missing values to the end (consistent with numpy)
                return 1 if x_i_na else -1
            if x_i == y_i:
                # values are the same -> resolve tie with next element
                continue
            # normal greater/less than comparison
            return -1 if x_i < y_i else 1
        # the shared prefix is equal -> shorter tuples first, else a tie
        return (len(x) > len(y)) - (len(x) < len(y))

    # Sort positions rather than the tuples themselves so the result can be
    # taken from ``values`` directly and stays a 1-D array of tuples.
    ixs = sorted(range(len(values)), key=functools.cmp_to_key(cmp_func))
    return values[ixs]
2095+
20582096
sorter = None
2059-
if (
2060-
not is_extension_array_dtype(values)
2061-
and lib.infer_dtype(values, skipna=False) == "mixed-integer"
2062-
):
2097+
2098+
ext_arr = is_extension_array_dtype(values)
2099+
if not ext_arr and lib.infer_dtype(values, skipna=False) == "mixed-integer":
20632100
# unorderable in py3 if mixed str/int
20642101
ordered = sort_mixed(values)
2102+
elif not ext_arr and values.size and isinstance(values[0], tuple):
2103+
# 1-D arrays with tuples of potentially mixed type (solves GH36562)
2104+
ordered = sort_tuples(values)
20652105
else:
20662106
try:
20672107
sorter = values.argsort()

pandas/tests/indexing/multiindex/test_multiindex.py

+23
Original file line numberDiff line numberDiff line change
@@ -91,3 +91,26 @@ def test_multiindex_get_loc_list_raises(self):
9191
msg = "unhashable type"
9292
with pytest.raises(TypeError, match=msg):
9393
idx.get_loc([])
94+
95+
96+
def test_combine_first_with_nan_index():
    # GH36562: combine_first on frames whose MultiIndex mixes string and NaN
    # labels used to raise
    # TypeError: '<' not supported between instances of 'float' and 'str'
    mi1 = pd.MultiIndex.from_arrays(
        [["b", "b", "c", "a", "b", np.nan], [1, 2, 3, 4, 5, 6]],
        names=["a", "b"],
    )
    df = pd.DataFrame({"c": [1, 1, 1, 1, 1, 1]}, index=mi1)
    mi2 = pd.MultiIndex.from_arrays(
        [["a", "b", "c", "a", "b", "d"], [1, 1, 1, 1, 1, 1]], names=["a", "b"]
    )
    s = pd.Series([1, 2, 3, 4, 5, 6], index=mi2)
    df_combined = df.combine_first(pd.DataFrame({"col": s}))

    mi_expected = pd.MultiIndex.from_arrays(
        [
            ["a", "a", "a", "b", "b", "b", "b", "c", "c", "d", np.nan],
            [1, 1, 4, 1, 1, 2, 5, 1, 3, 1, 6],
        ],
        names=["a", "b"],
    )
    # Index.equals matches NaN labels in the same position, whereas an
    # element-wise ``==`` on tuples only works if both hold the same NaN
    # object; it also returns False instead of raising on a length mismatch.
    assert df_combined.index.equals(mi_expected)
    assert list(df_combined.index.names) == ["a", "b"]

    # values contributed by the second frame
    exp_col = np.asarray(
        [1.0, 4.0, np.nan, 2.0, 5.0, np.nan, np.nan, 3.0, np.nan, 6.0, np.nan]
    )
    act_col = df_combined["col"].values
    assert np.allclose(act_col, exp_col, rtol=0, atol=0, equal_nan=True)

    # values of the calling frame: 1 wherever the label exists in ``mi1``
    exp_c = np.asarray(
        [np.nan, np.nan, 1.0, 1.0, 1.0, 1.0, 1.0, np.nan, 1.0, np.nan, 1.0]
    )
    act_c = df_combined["c"].values
    assert np.allclose(act_c, exp_c, rtol=0, atol=0, equal_nan=True)

pandas/tests/test_sorting.py

+7
Original file line numberDiff line numberDiff line change
@@ -453,3 +453,10 @@ def test_extension_array_codes(self, verify, na_sentinel):
453453
expected_codes = np.array([0, 2, na_sentinel, 1], dtype=np.intp)
454454
tm.assert_extension_array_equal(result, expected_values)
455455
tm.assert_numpy_array_equal(codes, expected_codes)
456+
457+
458+
def test_mixed_str_nan():
    # An object array mixing strings with NaN must be sortable; the
    # expected order places NaN ahead of the sorted strings.
    unsorted = np.array(["b", np.nan, "a", "b"], dtype=object)
    expected = np.array([np.nan, "a", "b", "b"], dtype=object)
    tm.assert_numpy_array_equal(safe_sort(unsorted), expected)

0 commit comments

Comments
 (0)