Skip to content

Gh 36562 typeerror comparison not supported between float and str #37096

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 23 commits into from
Nov 4, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
1320ff1
Fixed #36562
ssche Oct 13, 2020
a1b9385
Merge remote-tracking branch 'upstream/master' into gh-36562-typeerro…
ssche Oct 13, 2020
1f76d21
Fixed flake issue
ssche Oct 13, 2020
7076841
fixed linting errors
ssche Oct 13, 2020
225675d
Addressed review comments
ssche Oct 14, 2020
2677166
Make `sort_tuples` last resort sorting strategy to allow faster sorte…
ssche Oct 16, 2020
6f476bc
Merge remote-tracking branch 'upstream/master' into gh-36562-typeerro…
ssche Oct 16, 2020
3688238
manually apply pandas black changes (from CI)
ssche Oct 16, 2020
aba429c
Merge remote-tracking branch 'upstream/master' into gh-36562-typeerro…
ssche Oct 16, 2020
8ae9279
Modified changelog message as per review comments
ssche Oct 21, 2020
dd5a38d
Merge remote-tracking branch 'upstream/master' into gh-36562-typeerro…
ssche Oct 21, 2020
7b7d6f8
Removed pd.xyz
ssche Oct 22, 2020
4226662
Merge remote-tracking branch 'upstream/master' into gh-36562-typeerro…
ssche Oct 22, 2020
8cbfa01
forgot `pd`
ssche Oct 22, 2020
6d71000
Merge remote-tracking branch 'upstream/master' into gh-36562-typeerro…
ssche Oct 26, 2020
92e1e33
Changed sorting algorithm by using expanded array
ssche Oct 31, 2020
3ada9ce
Merge remote-tracking branch 'upstream/master' into gh-36562-typeerro…
ssche Oct 31, 2020
95670f1
Add ticket ref to test case
ssche Oct 31, 2020
d9dcd22
sort import
ssche Oct 31, 2020
66c30c7
Address review comments
ssche Oct 31, 2020
db66528
Merge remote-tracking branch 'upstream/master' into gh-36562-typeerro…
ssche Oct 31, 2020
16ae4f4
Merge remote-tracking branch 'upstream/master' into gh-36562-typeerro…
ssche Nov 3, 2020
9d03dc3
Fixed: committed file was in merge state
ssche Nov 3, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -469,6 +469,7 @@ MultiIndex

- Bug in :meth:`DataFrame.xs` when used with :class:`IndexSlice` raises ``TypeError`` with message ``"Expected label or tuple of labels"`` (:issue:`35301`)
- Bug in :meth:`DataFrame.reset_index` with ``NaT`` values in index raises ``ValueError`` with message ``"cannot convert float NaN to integer"`` (:issue:`36541`)
- Bug in :meth:`DataFrame.combine_first` when used with :class:`MultiIndex` containing string and ``NaN`` values raises ``TypeError`` (:issue:`36562`)

I/O
^^^
Expand Down
43 changes: 32 additions & 11 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -2061,27 +2061,25 @@ def safe_sort(
dtype, _ = infer_dtype_from_array(values)
values = np.asarray(values, dtype=dtype)

def sort_mixed(values):
# order ints before strings, safe in py3
str_pos = np.array([isinstance(x, str) for x in values], dtype=bool)
nums = np.sort(values[~str_pos])
strs = np.sort(values[str_pos])
return np.concatenate([nums, np.asarray(strs, dtype=object)])

sorter = None

if (
not is_extension_array_dtype(values)
and lib.infer_dtype(values, skipna=False) == "mixed-integer"
):
# unorderable in py3 if mixed str/int
ordered = sort_mixed(values)
ordered = _sort_mixed(values)
else:
try:
sorter = values.argsort()
ordered = values.take(sorter)
except TypeError:
# try this anyway
ordered = sort_mixed(values)
# Previous sorters failed or were not applicable, try `_sort_mixed`
# which would work, but which fails for special case of 1d arrays
# with tuples.
if values.size and isinstance(values[0], tuple):
ordered = _sort_tuples(values)
else:
ordered = _sort_mixed(values)

# codes:

Expand Down Expand Up @@ -2128,3 +2126,26 @@ def sort_mixed(values):
np.putmask(new_codes, mask, na_sentinel)

return ordered, ensure_platform_int(new_codes)


def _sort_mixed(values):
""" order ints before strings in 1d arrays, safe in py3 """
str_pos = np.array([isinstance(x, str) for x in values], dtype=bool)
nums = np.sort(values[~str_pos])
strs = np.sort(values[str_pos])
return np.concatenate([nums, np.asarray(strs, dtype=object)])


def _sort_tuples(values: np.ndarray[tuple]):
"""
Convert array of tuples (1d) to array or array (2d).
We need to keep the columns separately as they contain different types and
nans (can't use `np.sort` as it may fail when str and nan are mixed in a
column as types cannot be compared).
"""
from pandas.core.internals.construction import to_arrays
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ssche two completely contradictory questions/requests

  1. is there a reasonable way to do this without relying on core.internals?
  2. could pd.MultiIndex.from_tuples de-duplicate some code by using to_arrays?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is there a reasonable way to do this without relying on core.internals?

My first attempt (1320ff1) wasn't using core.internals, as far as I recall, but it was deemed too complex.

could pd.MultiIndex.from_tuples de-duplicate some code by using to_arrays?

Maybe, what are you trying to achieve?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe, what are you trying to achieve?

Just simplification/de-duplication

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know if this is still relevant (I forgot to respond), but I would argue that from_arrays should be preferred (and from_tuples delegating to from_arrays) as from_arrays could store type information better as it's accepting data in columnar format (instead of row-wise which from_arrays does).

Anyway, I hope you could proceed with your much appreciated simplification/de-dupe efforts...

from pandas.core.sorting import lexsort_indexer

arrays, _ = to_arrays(values, None)
indexer = lexsort_indexer(arrays, orders=True)
return values[indexer]
31 changes: 30 additions & 1 deletion pandas/tests/frame/methods/test_combine_first.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pytest

import pandas as pd
from pandas import DataFrame, Index, Series
from pandas import DataFrame, Index, MultiIndex, Series
import pandas._testing as tm


Expand Down Expand Up @@ -365,3 +365,32 @@ def test_combine_first_string_dtype_only_na(self):
{"a": ["962", "85"], "b": [pd.NA] * 2}, dtype="string"
).set_index(["a", "b"])
tm.assert_frame_equal(result, expected)


def test_combine_first_with_nan_multiindex():
# gh-36562

mi1 = MultiIndex.from_arrays(
[["b", "b", "c", "a", "b", np.nan], [1, 2, 3, 4, 5, 6]], names=["a", "b"]
)
df = DataFrame({"c": [1, 1, 1, 1, 1, 1]}, index=mi1)
mi2 = MultiIndex.from_arrays(
[["a", "b", "c", "a", "b", "d"], [1, 1, 1, 1, 1, 1]], names=["a", "b"]
)
s = Series([1, 2, 3, 4, 5, 6], index=mi2)
res = df.combine_first(DataFrame({"d": s}))
mi_expected = MultiIndex.from_arrays(
[
["a", "a", "a", "b", "b", "b", "b", "c", "c", "d", np.nan],
[1, 1, 4, 1, 1, 2, 5, 1, 3, 1, 6],
],
names=["a", "b"],
)
expected = DataFrame(
{
"c": [np.nan, np.nan, 1, 1, 1, 1, 1, np.nan, 1, np.nan, 1],
"d": [1.0, 4.0, np.nan, 2.0, 5.0, np.nan, np.nan, 3.0, np.nan, 6.0, np.nan],
},
index=mi_expected,
)
tm.assert_frame_equal(res, expected)
7 changes: 7 additions & 0 deletions pandas/tests/test_sorting.py
Original file line number Diff line number Diff line change
Expand Up @@ -453,3 +453,10 @@ def test_extension_array_codes(self, verify, na_sentinel):
expected_codes = np.array([0, 2, na_sentinel, 1], dtype=np.intp)
tm.assert_extension_array_equal(result, expected_values)
tm.assert_numpy_array_equal(codes, expected_codes)


def test_mixed_str_nan():
values = np.array(["b", np.nan, "a", "b"], dtype=object)
result = safe_sort(values)
expected = np.array([np.nan, "a", "b", "b"], dtype=object)
tm.assert_numpy_array_equal(result, expected)