pandas-dev · jreback · Nov 4, 2019 · Nov 3, 2019
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -14,6 +14,7 @@
 
 from pandas.core.dtypes.cast import (
     construct_1d_object_array_from_listlike,
+    infer_dtype_from_array,
     maybe_promote,
 )
 from pandas.core.dtypes.common import (
@@ -639,8 +640,6 @@ def factorize(values, sort: bool = False, order=None, na_sentinel=-1, size_hint=
         )
 
     if sort and len(uniques) > 0:
-        from pandas.core.sorting import safe_sort
-
         uniques, labels = safe_sort(
             uniques, labels, na_sentinel=na_sentinel, assume_unique=True, verify=False
         )
@@ -1920,3 +1919,138 @@ def diff(arr, n: int, axis: int = 0):
         out_arr = out_arr.astype("int64").view("timedelta64[ns]")
 
     return out_arr
+
+
+# --------------------------------------------------------------------
+# Helper functions
+
+# Note: safe_sort is in algorithms.py instead of sorting.py because it is
+#  low-dependency, is used in this module, and used private methods from
+#  this module.
+def safe_sort(
+    values,
+    labels=None,
+    na_sentinel: int = -1,
+    assume_unique: bool = False,
+    verify: bool = True,
+):
+    """
+    Sort ``values`` and reorder corresponding ``labels``.
+    ``values`` should be unique if ``labels`` is not None.
+    Safe for use with mixed types (int, str), orders ints before strs.
+
+    Parameters
+    ----------
+    values : list-like
+        Sequence; must be unique if ``labels`` is not None.
+    labels : list_like
+        Indices to ``values``. All out of bound indices are treated as
+        "not found" and will be masked with ``na_sentinel``.
+    na_sentinel : int, default -1
+        Value in ``labels`` to mark "not found".
+        Ignored when ``labels`` is None.
+    assume_unique : bool, default False
+        When True, ``values`` are assumed to be unique, which can speed up
+        the calculation. Ignored when ``labels`` is None.
+    verify : bool, default True
+        Check if labels are out of bound for the values and put out of bound
+        labels equal to na_sentinel. If ``verify=False``, it is assumed there
+        are no out of bound labels. Ignored when ``labels`` is None.
+
+        .. versionadded:: 0.25.0
+
+    Returns
+    -------
+    ordered : ndarray
+        Sorted ``values``
+    new_labels : ndarray
+        Reordered ``labels``; returned when ``labels`` is not None.
+
+    Raises
+    ------
+    TypeError
+        * If ``values`` is not list-like or if ``labels`` is neither None
+        nor list-like
+        * If ``values`` cannot be sorted
+    ValueError
+        * If ``labels`` is not None and ``values`` contain duplicates.
+    """
+    if not is_list_like(values):
+        raise TypeError(
+            "Only list-like objects are allowed to be passed to safe_sort as values"
+        )
+
+    if not isinstance(values, np.ndarray) and not is_extension_array_dtype(values):
+        # don't convert to string types
+        dtype, _ = infer_dtype_from_array(values)
+        values = np.asarray(values, dtype=dtype)
+
+    def sort_mixed(values):
+        # order ints before strings, safe in py3
+        str_pos = np.array([isinstance(x, str) for x in values], dtype=bool)
+        nums = np.sort(values[~str_pos])
+        strs = np.sort(values[str_pos])
+        return np.concatenate([nums, np.asarray(strs, dtype=object)])
+
+    sorter = None
+    if (
+        not is_extension_array_dtype(values)
+        and lib.infer_dtype(values, skipna=False) == "mixed-integer"
+    ):
+        # unorderable in py3 if mixed str/int
+        ordered = sort_mixed(values)
+    else:
+        try:
+            sorter = values.argsort()
+            ordered = values.take(sorter)
+        except TypeError:
+            # try this anyway
+            ordered = sort_mixed(values)
+
+    # labels:
+
+    if labels is None:
+        return ordered
+
+    if not is_list_like(labels):
+        raise TypeError(
+            "Only list-like objects or None are allowed to be"
+            "passed to safe_sort as labels"
+        )
+    labels = ensure_platform_int(np.asarray(labels))
+
+    from pandas import Index
+
+    if not assume_unique and not Index(values).is_unique:
+        raise ValueError("values should be unique if labels is not None")
+
+    if sorter is None:
+        # mixed types
+        hash_klass, values = _get_data_algo(values)
+        t = hash_klass(len(values))
+        t.map_locations(values)
+        sorter = ensure_platform_int(t.lookup(ordered))
+
+    if na_sentinel == -1:
+        # take_1d is faster, but only works for na_sentinels of -1
+        order2 = sorter.argsort()
+        new_labels = take_1d(order2, labels, fill_value=-1)
+        if verify:
+            mask = (labels < -len(values)) | (labels >= len(values))
+        else:
+            mask = None
+    else:
+        reverse_indexer = np.empty(len(sorter), dtype=np.int_)
+        reverse_indexer.put(sorter, np.arange(len(sorter)))
+        # Out of bound indices will be masked with `na_sentinel` next, so we
+        # may deal with them here without performance loss using `mode='wrap'`
+        new_labels = reverse_indexer.take(labels, mode="wrap")
+
+        mask = labels == na_sentinel
+        if verify:
+            mask = mask | (labels < -len(values)) | (labels >= len(values))
+
+    if mask is not None:
+        np.putmask(new_labels, mask, na_sentinel)
+
+    return ordered, ensure_platform_int(new_labels)
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -73,7 +73,6 @@
 import pandas.core.missing as missing
 from pandas.core.ops import get_op_result_name
 from pandas.core.ops.invalid import make_invalid_op
-import pandas.core.sorting as sorting
 from pandas.core.strings import StringMethods
 
 from pandas.io.formats.printing import (
@@ -2507,7 +2506,7 @@ def _union(self, other, sort):
 
             if sort is None:
                 try:
-                    result = sorting.safe_sort(result)
+                    result = algos.safe_sort(result)
                 except TypeError as e:
                     warnings.warn(
                         "{}, sort order is undefined for "
@@ -2603,7 +2602,7 @@ def intersection(self, other, sort=False):
         taken = other.take(indexer)
 
         if sort is None:
-            taken = sorting.safe_sort(taken.values)
+            taken = algos.safe_sort(taken.values)
             if self.name != other.name:
                 name = None
             else:
@@ -2673,7 +2672,7 @@ def difference(self, other, sort=None):
         the_diff = this.values.take(label_diff)
         if sort is None:
             try:
-                the_diff = sorting.safe_sort(the_diff)
+                the_diff = algos.safe_sort(the_diff)
             except TypeError:
                 pass
 
@@ -2750,7 +2749,7 @@ def symmetric_difference(self, other, result_name=None, sort=None):
         the_diff = concat_compat([left_diff, right_diff])
         if sort is None:
             try:
-                the_diff = sorting.safe_sort(the_diff)
+                the_diff = algos.safe_sort(the_diff)
             except TypeError:
                 pass
 

diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
@@ -44,7 +44,6 @@
 import pandas.core.common as com
 from pandas.core.frame import _merge_doc
 from pandas.core.internals import _transform_index, concatenate_block_managers
-import pandas.core.sorting as sorting
 from pandas.core.sorting import is_int64_overflow_possible
 
 
@@ -1912,7 +1911,7 @@ def _sort_labels(uniques, left, right):
     llength = len(left)
     labels = np.concatenate([left, right])
 
-    _, new_labels = sorting.safe_sort(uniques, labels, na_sentinel=-1)
+    _, new_labels = algos.safe_sort(uniques, labels, na_sentinel=-1)
     new_labels = ensure_int64(new_labels)
     new_left, new_right = new_labels[:llength], new_labels[llength:]
 

diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py
@@ -4,13 +4,11 @@
 from pandas._libs import algos, hashtable, lib
 from pandas._libs.hashtable import unique_label_indices
 
-from pandas.core.dtypes.cast import infer_dtype_from_array
 from pandas.core.dtypes.common import (
     ensure_int64,
     ensure_platform_int,
     is_categorical_dtype,
     is_extension_array_dtype,
-    is_list_like,
 )
 from pandas.core.dtypes.missing import isna
 
@@ -389,132 +387,3 @@ def _reorder_by_uniques(uniques, labels):
     uniques = algorithms.take_nd(uniques, sorter, allow_fill=False)
 
     return uniques, labels
-
-
-def safe_sort(
-    values,
-    labels=None,
-    na_sentinel: int = -1,
-    assume_unique: bool = False,
-    verify: bool = True,
-):
-    """
-    Sort ``values`` and reorder corresponding ``labels``.
-    ``values`` should be unique if ``labels`` is not None.
-    Safe for use with mixed types (int, str), orders ints before strs.
-
-    Parameters
-    ----------
-    values : list-like
-        Sequence; must be unique if ``labels`` is not None.
-    labels : list_like
-        Indices to ``values``. All out of bound indices are treated as
-        "not found" and will be masked with ``na_sentinel``.
-    na_sentinel : int, default -1
-        Value in ``labels`` to mark "not found".
-        Ignored when ``labels`` is None.
-    assume_unique : bool, default False
-        When True, ``values`` are assumed to be unique, which can speed up
-        the calculation. Ignored when ``labels`` is None.
-    verify : bool, default True
-        Check if labels are out of bound for the values and put out of bound
-        labels equal to na_sentinel. If ``verify=False``, it is assumed there
-        are no out of bound labels. Ignored when ``labels`` is None.
-
-        .. versionadded:: 0.25.0
-
-    Returns
-    -------
-    ordered : ndarray
-        Sorted ``values``
-    new_labels : ndarray
-        Reordered ``labels``; returned when ``labels`` is not None.
-
-    Raises
-    ------
-    TypeError
-        * If ``values`` is not list-like or if ``labels`` is neither None
-        nor list-like
-        * If ``values`` cannot be sorted
-    ValueError
-        * If ``labels`` is not None and ``values`` contain duplicates.
-    """
-    if not is_list_like(values):
-        raise TypeError(
-            "Only list-like objects are allowed to be passed to safe_sort as values"
-        )
-
-    if not isinstance(values, np.ndarray) and not is_extension_array_dtype(values):
-        # don't convert to string types
-        dtype, _ = infer_dtype_from_array(values)
-        values = np.asarray(values, dtype=dtype)
-
-    def sort_mixed(values):
-        # order ints before strings, safe in py3
-        str_pos = np.array([isinstance(x, str) for x in values], dtype=bool)
-        nums = np.sort(values[~str_pos])
-        strs = np.sort(values[str_pos])
-        return np.concatenate([nums, np.asarray(strs, dtype=object)])
-
-    sorter = None
-    if (
-        not is_extension_array_dtype(values)
-        and lib.infer_dtype(values, skipna=False) == "mixed-integer"
-    ):
-        # unorderable in py3 if mixed str/int
-        ordered = sort_mixed(values)
-    else:
-        try:
-            sorter = values.argsort()
-            ordered = values.take(sorter)
-        except TypeError:
-            # try this anyway
-            ordered = sort_mixed(values)
-
-    # labels:
-
-    if labels is None:
-        return ordered
-
-    if not is_list_like(labels):
-        raise TypeError(
-            "Only list-like objects or None are allowed to be"
-            "passed to safe_sort as labels"
-        )
-    labels = ensure_platform_int(np.asarray(labels))
-
-    from pandas import Index
-
-    if not assume_unique and not Index(values).is_unique:
-        raise ValueError("values should be unique if labels is not None")
-
-    if sorter is None:
-        # mixed types
-        hash_klass, values = algorithms._get_data_algo(values)
-        t = hash_klass(len(values))
-        t.map_locations(values)
-        sorter = ensure_platform_int(t.lookup(ordered))
-
-    if na_sentinel == -1:
-        # take_1d is faster, but only works for na_sentinels of -1
-        order2 = sorter.argsort()
-        new_labels = algorithms.take_1d(order2, labels, fill_value=-1)
-        if verify:
-            mask = (labels < -len(values)) | (labels >= len(values))
-        else:
-            mask = None
-    else:
-        reverse_indexer = np.empty(len(sorter), dtype=np.int_)
-        reverse_indexer.put(sorter, np.arange(len(sorter)))
-        # Out of bound indices will be masked with `na_sentinel` next, so we
-        # may deal with them here without performance loss using `mode='wrap'`
-        new_labels = reverse_indexer.take(labels, mode="wrap")
-
-        mask = labels == na_sentinel
-        if verify:
-            mask = mask | (labels < -len(values)) | (labels >= len(values))
-
-    if mask is not None:
-        np.putmask(new_labels, mask, na_sentinel)
-
-    return ordered, ensure_platform_int(new_labels)
diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py
@@ -33,13 +33,13 @@
     isna,
     period_range,
 )
+from pandas.core.algorithms import safe_sort
 from pandas.core.index import (
     _get_combined_index,
     ensure_index,
     ensure_index_from_sequences,
 )
 from pandas.core.indexes.api import Index, MultiIndex
-from pandas.core.sorting import safe_sort
 from pandas.tests.indexes.common import Base
 from pandas.tests.indexes.conftest import indices_dict
 import pandas.util.testing as tm

diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
@@ -26,7 +26,6 @@
 import pandas.core.algorithms as algos
 from pandas.core.arrays import DatetimeArray
 import pandas.core.common as com
-from pandas.core.sorting import safe_sort
 import pandas.util.testing as tm
 
 
@@ -309,7 +308,7 @@ def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques):
         labels, uniques = algos.factorize(data, sort=sort, na_sentinel=na_sentinel)
         if sort:
             expected_labels = np.array([1, 0, na_sentinel, 1], dtype=np.intp)
-            expected_uniques = safe_sort(uniques)
+            expected_uniques = algos.safe_sort(uniques)
         else:
             expected_labels = np.array([0, 1, na_sentinel, 0], dtype=np.intp)
             expected_uniques = uniques