From 40c01f8218042be07e34db6deea679b715513495 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 3 Nov 2019 09:14:15 -0800 Subject: [PATCH] REF: move safe_sort to algos to avoid private/circular dependencies --- pandas/core/algorithms.py | 138 ++++++++++++++++++++++++++- pandas/core/indexes/base.py | 9 +- pandas/core/reshape/merge.py | 3 +- pandas/core/sorting.py | 131 ------------------------- pandas/tests/indexes/test_base.py | 2 +- pandas/tests/test_algos.py | 3 +- pandas/tests/test_sorting.py | 2 +- pandas/tests/window/test_pairwise.py | 2 +- 8 files changed, 145 insertions(+), 145 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 06ba2a7e0ccfb..fc55bfbae0900 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -14,6 +14,7 @@ from pandas.core.dtypes.cast import ( construct_1d_object_array_from_listlike, + infer_dtype_from_array, maybe_promote, ) from pandas.core.dtypes.common import ( @@ -639,8 +640,6 @@ def factorize(values, sort: bool = False, order=None, na_sentinel=-1, size_hint= ) if sort and len(uniques) > 0: - from pandas.core.sorting import safe_sort - uniques, labels = safe_sort( uniques, labels, na_sentinel=na_sentinel, assume_unique=True, verify=False ) @@ -1920,3 +1919,138 @@ def diff(arr, n: int, axis: int = 0): out_arr = out_arr.astype("int64").view("timedelta64[ns]") return out_arr + + +# -------------------------------------------------------------------- +# Helper functions + +# Note: safe_sort is in algorithms.py instead of sorting.py because it is +# low-dependency, is used in this module, and used private methods from +# this module. +def safe_sort( + values, + labels=None, + na_sentinel: int = -1, + assume_unique: bool = False, + verify: bool = True, +): + """ + Sort ``values`` and reorder corresponding ``labels``. + ``values`` should be unique if ``labels`` is not None. + Safe for use with mixed types (int, str), orders ints before strs. + + Parameters + ---------- + values : list-like + Sequence; must be unique if ``labels`` is not None. + labels : list_like + Indices to ``values``. All out of bound indices are treated as + "not found" and will be masked with ``na_sentinel``. + na_sentinel : int, default -1 + Value in ``labels`` to mark "not found". + Ignored when ``labels`` is None. + assume_unique : bool, default False + When True, ``values`` are assumed to be unique, which can speed up + the calculation. Ignored when ``labels`` is None. + verify : bool, default True + Check if labels are out of bound for the values and put out of bound + labels equal to na_sentinel. If ``verify=False``, it is assumed there + are no out of bound labels. Ignored when ``labels`` is None. + + .. versionadded:: 0.25.0 + + Returns + ------- + ordered : ndarray + Sorted ``values`` + new_labels : ndarray + Reordered ``labels``; returned when ``labels`` is not None. + + Raises + ------ + TypeError + * If ``values`` is not list-like or if ``labels`` is neither None + nor list-like + * If ``values`` cannot be sorted + ValueError + * If ``labels`` is not None and ``values`` contain duplicates. + """ + if not is_list_like(values): + raise TypeError( + "Only list-like objects are allowed to be passed to safe_sort as values" + ) + + if not isinstance(values, np.ndarray) and not is_extension_array_dtype(values): + # don't convert to string types + dtype, _ = infer_dtype_from_array(values) + values = np.asarray(values, dtype=dtype) + + def sort_mixed(values): + # order ints before strings, safe in py3 + str_pos = np.array([isinstance(x, str) for x in values], dtype=bool) + nums = np.sort(values[~str_pos]) + strs = np.sort(values[str_pos]) + return np.concatenate([nums, np.asarray(strs, dtype=object)]) + + sorter = None + if ( + not is_extension_array_dtype(values) + and lib.infer_dtype(values, skipna=False) == "mixed-integer" + ): + # unorderable in py3 if mixed str/int + ordered = sort_mixed(values) + else: + try: + sorter = values.argsort() + ordered = values.take(sorter) + except TypeError: + # try this anyway + ordered = sort_mixed(values) + + # labels: + + if labels is None: + return ordered + + if not is_list_like(labels): + raise TypeError( + "Only list-like objects or None are allowed to be" + "passed to safe_sort as labels" + ) + labels = ensure_platform_int(np.asarray(labels)) + + from pandas import Index + + if not assume_unique and not Index(values).is_unique: + raise ValueError("values should be unique if labels is not None") + + if sorter is None: + # mixed types + hash_klass, values = _get_data_algo(values) + t = hash_klass(len(values)) + t.map_locations(values) + sorter = ensure_platform_int(t.lookup(ordered)) + + if na_sentinel == -1: + # take_1d is faster, but only works for na_sentinels of -1 + order2 = sorter.argsort() + new_labels = take_1d(order2, labels, fill_value=-1) + if verify: + mask = (labels < -len(values)) | (labels >= len(values)) + else: + mask = None + else: + reverse_indexer = np.empty(len(sorter), dtype=np.int_) + reverse_indexer.put(sorter, np.arange(len(sorter))) + # Out of bound indices will be masked with `na_sentinel` next, so we + # may deal with them here without performance loss using `mode='wrap'` + new_labels = reverse_indexer.take(labels, mode="wrap") + + mask = labels == na_sentinel + if verify: + mask = mask | (labels < -len(values)) | (labels >= len(values)) + + if mask is not None: + np.putmask(new_labels, mask, na_sentinel) + + return ordered, ensure_platform_int(new_labels) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 187c7e2f3a7f7..4c5b7442337fb 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -73,7 +73,6 @@ import pandas.core.missing as missing from pandas.core.ops import get_op_result_name from pandas.core.ops.invalid import make_invalid_op -import pandas.core.sorting as sorting from pandas.core.strings import StringMethods from pandas.io.formats.printing import ( @@ -2507,7 +2506,7 @@ def _union(self, other, sort): if sort is None: try: - result = sorting.safe_sort(result) + result = algos.safe_sort(result) except TypeError as e: warnings.warn( "{}, sort order is undefined for " @@ -2603,7 +2602,7 @@ def intersection(self, other, sort=False): taken = other.take(indexer) if sort is None: - taken = sorting.safe_sort(taken.values) + taken = algos.safe_sort(taken.values) if self.name != other.name: name = None else: @@ -2673,7 +2672,7 @@ def difference(self, other, sort=None): the_diff = this.values.take(label_diff) if sort is None: try: - the_diff = sorting.safe_sort(the_diff) + the_diff = algos.safe_sort(the_diff) except TypeError: pass @@ -2750,7 +2749,7 @@ def symmetric_difference(self, other, result_name=None, sort=None): the_diff = concat_compat([left_diff, right_diff]) if sort is None: try: - the_diff = sorting.safe_sort(the_diff) + the_diff = algos.safe_sort(the_diff) except TypeError: pass diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index ea334503a4302..9845c570ca704 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -44,7 +44,6 @@ import pandas.core.common as com from pandas.core.frame import _merge_doc from pandas.core.internals import _transform_index, concatenate_block_managers -import pandas.core.sorting as sorting from pandas.core.sorting import is_int64_overflow_possible @@ -1912,7 +1911,7 @@ def _sort_labels(uniques, left, right): llength = len(left) labels = np.concatenate([left, right]) - _, new_labels = sorting.safe_sort(uniques, labels, na_sentinel=-1) + _, new_labels = algos.safe_sort(uniques, labels, na_sentinel=-1) new_labels = ensure_int64(new_labels) new_left, new_right = new_labels[:llength], new_labels[llength:] diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 9b8a1a76e419c..82eb93dd4c879 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -4,13 +4,11 @@ from pandas._libs import algos, hashtable, lib from pandas._libs.hashtable import unique_label_indices -from pandas.core.dtypes.cast import infer_dtype_from_array from pandas.core.dtypes.common import ( ensure_int64, ensure_platform_int, is_categorical_dtype, is_extension_array_dtype, - is_list_like, ) from pandas.core.dtypes.missing import isna @@ -389,132 +387,3 @@ def _reorder_by_uniques(uniques, labels): uniques = algorithms.take_nd(uniques, sorter, allow_fill=False) return uniques, labels - - -def safe_sort( - values, - labels=None, - na_sentinel: int = -1, - assume_unique: bool = False, - verify: bool = True, -): - """ - Sort ``values`` and reorder corresponding ``labels``. - ``values`` should be unique if ``labels`` is not None. - Safe for use with mixed types (int, str), orders ints before strs. - - Parameters - ---------- - values : list-like - Sequence; must be unique if ``labels`` is not None. - labels : list_like - Indices to ``values``. All out of bound indices are treated as - "not found" and will be masked with ``na_sentinel``. - na_sentinel : int, default -1 - Value in ``labels`` to mark "not found". - Ignored when ``labels`` is None. - assume_unique : bool, default False - When True, ``values`` are assumed to be unique, which can speed up - the calculation. Ignored when ``labels`` is None. - verify : bool, default True - Check if labels are out of bound for the values and put out of bound - labels equal to na_sentinel. If ``verify=False``, it is assumed there - are no out of bound labels. Ignored when ``labels`` is None. - - .. versionadded:: 0.25.0 - - Returns - ------- - ordered : ndarray - Sorted ``values`` - new_labels : ndarray - Reordered ``labels``; returned when ``labels`` is not None. - - Raises - ------ - TypeError - * If ``values`` is not list-like or if ``labels`` is neither None - nor list-like - * If ``values`` cannot be sorted - ValueError - * If ``labels`` is not None and ``values`` contain duplicates. - """ - if not is_list_like(values): - raise TypeError( - "Only list-like objects are allowed to be passed to safe_sort as values" - ) - - if not isinstance(values, np.ndarray) and not is_extension_array_dtype(values): - # don't convert to string types - dtype, _ = infer_dtype_from_array(values) - values = np.asarray(values, dtype=dtype) - - def sort_mixed(values): - # order ints before strings, safe in py3 - str_pos = np.array([isinstance(x, str) for x in values], dtype=bool) - nums = np.sort(values[~str_pos]) - strs = np.sort(values[str_pos]) - return np.concatenate([nums, np.asarray(strs, dtype=object)]) - - sorter = None - if ( - not is_extension_array_dtype(values) - and lib.infer_dtype(values, skipna=False) == "mixed-integer" - ): - # unorderable in py3 if mixed str/int - ordered = sort_mixed(values) - else: - try: - sorter = values.argsort() - ordered = values.take(sorter) - except TypeError: - # try this anyway - ordered = sort_mixed(values) - - # labels: - - if labels is None: - return ordered - - if not is_list_like(labels): - raise TypeError( - "Only list-like objects or None are allowed to be" - "passed to safe_sort as labels" - ) - labels = ensure_platform_int(np.asarray(labels)) - - from pandas import Index - - if not assume_unique and not Index(values).is_unique: - raise ValueError("values should be unique if labels is not None") - - if sorter is None: - # mixed types - hash_klass, values = algorithms._get_data_algo(values) - t = hash_klass(len(values)) - t.map_locations(values) - sorter = ensure_platform_int(t.lookup(ordered)) - - if na_sentinel == -1: - # take_1d is faster, but only works for na_sentinels of -1 - order2 = sorter.argsort() - new_labels = algorithms.take_1d(order2, labels, fill_value=-1) - if verify: - mask = (labels < -len(values)) | (labels >= len(values)) - else: - mask = None - else: - reverse_indexer = np.empty(len(sorter), dtype=np.int_) - reverse_indexer.put(sorter, np.arange(len(sorter))) - # Out of bound indices will be masked with `na_sentinel` next, so we - # may deal with them here without performance loss using `mode='wrap'` - new_labels = reverse_indexer.take(labels, mode="wrap") - - mask = labels == na_sentinel - if verify: - mask = mask | (labels < -len(values)) | (labels >= len(values)) - - if mask is not None: - np.putmask(new_labels, mask, na_sentinel) - - return ordered, ensure_platform_int(new_labels) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 8d0cb0edf51df..e43d340a46d9f 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -33,13 +33,13 @@ isna, period_range, ) +from pandas.core.algorithms import safe_sort from pandas.core.index import ( _get_combined_index, ensure_index, ensure_index_from_sequences, ) from pandas.core.indexes.api import Index, MultiIndex -from pandas.core.sorting import safe_sort from pandas.tests.indexes.common import Base from pandas.tests.indexes.conftest import indices_dict import pandas.util.testing as tm diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 9dd88fd5dd25b..a64501040442d 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -26,7 +26,6 @@ import pandas.core.algorithms as algos from pandas.core.arrays import DatetimeArray import pandas.core.common as com -from pandas.core.sorting import safe_sort import pandas.util.testing as tm @@ -309,7 +308,7 @@ def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques): labels, uniques = algos.factorize(data, sort=sort, na_sentinel=na_sentinel) if sort: expected_labels = np.array([1, 0, na_sentinel, 1], dtype=np.intp) - expected_uniques = safe_sort(uniques) + expected_uniques = algos.safe_sort(uniques) else: expected_labels = np.array([0, 1, na_sentinel, 0], dtype=np.intp) expected_uniques = uniques diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py index b86aaa0ed7e1f..5d7eb70817a11 100644 --- a/pandas/tests/test_sorting.py +++ b/pandas/tests/test_sorting.py @@ -6,6 +6,7 @@ import pytest from pandas import DataFrame, MultiIndex, Series, array, concat, merge +from pandas.core.algorithms import safe_sort import pandas.core.common as com from pandas.core.sorting import ( decons_group_index, @@ -13,7 +14,6 @@ is_int64_overflow_possible, lexsort_indexer, nargsort, - safe_sort, ) import pandas.util.testing as tm diff --git a/pandas/tests/window/test_pairwise.py b/pandas/tests/window/test_pairwise.py index 56d89e15c418c..6f6d4c09526ff 100644 --- a/pandas/tests/window/test_pairwise.py +++ b/pandas/tests/window/test_pairwise.py @@ -3,7 +3,7 @@ import pytest from pandas import DataFrame, Series -from pandas.core.sorting import safe_sort +from pandas.core.algorithms import safe_sort import pandas.util.testing as tm