diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 80bf7214e7a27..7f684711b3042 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -616,7 +616,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): na_value=na_value) if sort and len(uniques) > 0: - from pandas.core.sorting import safe_sort + from pandas.core.sorting import safe_sort, SortError if na_sentinel == -1: # GH-25409 take_1d only works for na_sentinels of -1 try: @@ -626,13 +626,19 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): uniques = uniques.take(order) except TypeError: # Mixed types, where uniques.argsort fails. + try: + uniques, labels = safe_sort(uniques, labels, + na_sentinel=na_sentinel, + assume_unique=True) + except SortError as e: + raise TypeError(e) from e + else: + try: uniques, labels = safe_sort(uniques, labels, na_sentinel=na_sentinel, assume_unique=True) - else: - uniques, labels = safe_sort(uniques, labels, - na_sentinel=na_sentinel, - assume_unique=True) + except SortError as e: + raise TypeError(e) from e uniques = _reconstruct_data(uniques, dtype, original) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index a43e1b3007e2b..8409bae7d5e54 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -43,6 +43,7 @@ import pandas.core.missing as missing from pandas.core.ops import get_op_result_name, make_invalid_op import pandas.core.sorting as sorting +from pandas.core.sorting import SortError from pandas.core.strings import StringMethods from pandas.io.formats.printing import ( @@ -2345,7 +2346,7 @@ def union(self, other, sort=None): if sort is None: try: result = sorting.safe_sort(result) - except TypeError as e: + except SortError as e: warnings.warn("{}, sort order is undefined for " "incomparable objects".format(e), RuntimeWarning, stacklevel=3) @@ -2432,7 +2433,10 @@ def intersection(self, other, sort=False): taken = other.take(indexer) if sort is None: - taken = sorting.safe_sort(taken.values) + try: + taken = sorting.safe_sort(taken.values) + except sorting.SortError as e: + raise TypeError(e) from e if self.name != other.name: name = None else: @@ -2504,7 +2508,7 @@ def difference(self, other, sort=None): if sort is None: try: the_diff = sorting.safe_sort(the_diff) - except TypeError: + except SortError: pass return this._shallow_copy(the_diff, name=result_name, freq=None) @@ -2580,7 +2584,7 @@ def symmetric_difference(self, other, result_name=None, sort=None): if sort is None: try: the_diff = sorting.safe_sort(the_diff) - except TypeError: + except SortError: pass attribs = self._get_attributes_dict() diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index cd5c853c6efe4..edbea5696277c 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1738,7 +1738,10 @@ def _sort_labels(uniques, left, right): llength = len(left) labels = np.concatenate([left, right]) - _, new_labels = sorting.safe_sort(uniques, labels, na_sentinel=-1) + try: + _, new_labels = sorting.safe_sort(uniques, labels, na_sentinel=-1) + except sorting.SortError as e: + raise TypeError(e) from e new_labels = ensure_int64(new_labels) new_left, new_right = new_labels[:llength], new_labels[llength:] diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index e1cf5b76ba05b..e8c0d6e75b7d4 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -17,6 +17,13 @@ _INT64_MAX = np.iinfo(np.int64).max +class SortError(TypeError): + """ + Error raised when problems arise during sorting due to problems + with input data. Subclass of `TypeError`. + """ + + def get_group_index(labels, shape, sort, xnull): """ For the particular label_list, gets the offsets into the hypothetical list @@ -437,8 +444,9 @@ def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False): ------ TypeError * If ``values`` is not list-like or if ``labels`` is neither None - nor list-like - * If ``values`` cannot be sorted + nor list-like. + SortError + * If ``values`` cannot be sorted. ValueError * If ``labels`` is not None and ``values`` contain duplicates. """ @@ -456,8 +464,11 @@ def sort_mixed(values): # order ints before strings, safe in py3 str_pos = np.array([isinstance(x, string_types) for x in values], dtype=bool) - nums = np.sort(values[~str_pos]) - strs = np.sort(values[str_pos]) + try: + nums = np.sort(values[~str_pos]) + strs = np.sort(values[str_pos]) + except TypeError as e: + raise SortError(e) from e return np.concatenate([nums, np.asarray(strs, dtype=object)]) sorter = None diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index b64786de264cd..6f4f0f0fa28e4 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -24,6 +24,7 @@ import pandas.core.algorithms as algos from pandas.core.arrays import DatetimeArray import pandas.core.common as com +from pandas.core.sorting import SortError import pandas.util.testing as tm from pandas.util.testing import assert_almost_equal @@ -228,11 +229,9 @@ def test_complex_sorting(self): # gh 12666 - check no segfault x17 = np.array([complex(i) for i in range(17)], dtype=object) - msg = ("unorderable types: .* [<>] .*" - "|" # the above case happens for numpy < 1.14 - "'[<>]' not supported between instances of .*") - with pytest.raises(TypeError, match=msg): + with pytest.raises(TypeError, match="complex") as excinfo: algos.factorize(x17[::-1], sort=True) + assert type(excinfo.value.__cause__) == SortError def test_float64_factorize(self, writable): data = np.array([1.0, 1e8, 1.0, 1e-8, 1e8, 1.0], dtype=np.float64) diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py index 04a50cf6facd5..3adb8f3aba0eb 100644 --- a/pandas/tests/test_sorting.py +++ b/pandas/tests/test_sorting.py @@ -10,7 +10,7 @@ DataFrame, MultiIndex, Series, compat, concat, merge, to_datetime) from pandas.core import common as com from pandas.core.sorting import ( - decons_group_index, get_group_index, is_int64_overflow_possible, + SortError, decons_group_index, get_group_index, is_int64_overflow_possible, lexsort_indexer, nargsort, safe_sort) from pandas.util import testing as tm from pandas.util.testing import assert_frame_equal, assert_series_equal @@ -413,10 +413,8 @@ def test_mixed_integer_from_list(self): def test_unsortable(self): # GH 13714 arr = np.array([1, 2, datetime.now(), 0, 3], dtype=object) - msg = ("unorderable types: .* [<>] .*" - "|" # the above case happens for numpy < 1.14 - "'[<>]' not supported between instances of .*") - with pytest.raises(TypeError, match=msg): + msg = "int.*datetime|datetime.*int" + with pytest.raises(SortError, match=msg): safe_sort(arr) def test_exceptions(self):