Skip to content

Commit 11476cb

Browse files
jbrockmendel authored and jreback committed
REF: move safe_sort to algos to avoid private/circular dependencies (#29384)
1 parent 8af8949 commit 11476cb

File tree

8 files changed

+145
-145
lines changed

8 files changed

+145
-145
lines changed

pandas/core/algorithms.py

+136-2
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
from pandas.core.dtypes.cast import (
1616
construct_1d_object_array_from_listlike,
17+
infer_dtype_from_array,
1718
maybe_promote,
1819
)
1920
from pandas.core.dtypes.common import (
@@ -639,8 +640,6 @@ def factorize(values, sort: bool = False, order=None, na_sentinel=-1, size_hint=
639640
)
640641

641642
if sort and len(uniques) > 0:
642-
from pandas.core.sorting import safe_sort
643-
644643
uniques, labels = safe_sort(
645644
uniques, labels, na_sentinel=na_sentinel, assume_unique=True, verify=False
646645
)
@@ -1910,3 +1909,138 @@ def diff(arr, n: int, axis: int = 0):
19101909
out_arr = out_arr.astype("int64").view("timedelta64[ns]")
19111910

19121911
return out_arr
1912+
1913+
1914+
# --------------------------------------------------------------------
1915+
# Helper functions
1916+
1917+
# Note: safe_sort is in algorithms.py instead of sorting.py because it is
1918+
low-dependency, is used in this module, and uses private methods from
1919+
# this module.
1920+
def safe_sort(
1921+
values,
1922+
labels=None,
1923+
na_sentinel: int = -1,
1924+
assume_unique: bool = False,
1925+
verify: bool = True,
1926+
):
1927+
"""
1928+
Sort ``values`` and reorder corresponding ``labels``.
1929+
``values`` should be unique if ``labels`` is not None.
1930+
Safe for use with mixed types (int, str), orders ints before strs.
1931+
1932+
Parameters
1933+
----------
1934+
values : list-like
1935+
Sequence; must be unique if ``labels`` is not None.
1936+
labels : list_like
1937+
Indices to ``values``. All out of bound indices are treated as
1938+
"not found" and will be masked with ``na_sentinel``.
1939+
na_sentinel : int, default -1
1940+
Value in ``labels`` to mark "not found".
1941+
Ignored when ``labels`` is None.
1942+
assume_unique : bool, default False
1943+
When True, ``values`` are assumed to be unique, which can speed up
1944+
the calculation. Ignored when ``labels`` is None.
1945+
verify : bool, default True
1946+
Check if labels are out of bound for the values and put out of bound
1947+
labels equal to na_sentinel. If ``verify=False``, it is assumed there
1948+
are no out of bound labels. Ignored when ``labels`` is None.
1949+
1950+
.. versionadded:: 0.25.0
1951+
1952+
Returns
1953+
-------
1954+
ordered : ndarray
1955+
Sorted ``values``
1956+
new_labels : ndarray
1957+
Reordered ``labels``; returned when ``labels`` is not None.
1958+
1959+
Raises
1960+
------
1961+
TypeError
1962+
* If ``values`` is not list-like or if ``labels`` is neither None
1963+
nor list-like
1964+
* If ``values`` cannot be sorted
1965+
ValueError
1966+
* If ``labels`` is not None and ``values`` contain duplicates.
1967+
"""
1968+
if not is_list_like(values):
1969+
raise TypeError(
1970+
"Only list-like objects are allowed to be passed to safe_sort as values"
1971+
)
1972+
1973+
if not isinstance(values, np.ndarray) and not is_extension_array_dtype(values):
1974+
# don't convert to string types
1975+
dtype, _ = infer_dtype_from_array(values)
1976+
values = np.asarray(values, dtype=dtype)
1977+
1978+
def sort_mixed(values):
1979+
# order ints before strings, safe in py3
1980+
str_pos = np.array([isinstance(x, str) for x in values], dtype=bool)
1981+
nums = np.sort(values[~str_pos])
1982+
strs = np.sort(values[str_pos])
1983+
return np.concatenate([nums, np.asarray(strs, dtype=object)])
1984+
1985+
sorter = None
1986+
if (
1987+
not is_extension_array_dtype(values)
1988+
and lib.infer_dtype(values, skipna=False) == "mixed-integer"
1989+
):
1990+
# unorderable in py3 if mixed str/int
1991+
ordered = sort_mixed(values)
1992+
else:
1993+
try:
1994+
sorter = values.argsort()
1995+
ordered = values.take(sorter)
1996+
except TypeError:
1997+
# try this anyway
1998+
ordered = sort_mixed(values)
1999+
2000+
# labels:
2001+
2002+
if labels is None:
2003+
return ordered
2004+
2005+
if not is_list_like(labels):
2006+
raise TypeError(
2007+
"Only list-like objects or None are allowed to be"
2008+
"passed to safe_sort as labels"
2009+
)
2010+
labels = ensure_platform_int(np.asarray(labels))
2011+
2012+
from pandas import Index
2013+
2014+
if not assume_unique and not Index(values).is_unique:
2015+
raise ValueError("values should be unique if labels is not None")
2016+
2017+
if sorter is None:
2018+
# mixed types
2019+
hash_klass, values = _get_data_algo(values)
2020+
t = hash_klass(len(values))
2021+
t.map_locations(values)
2022+
sorter = ensure_platform_int(t.lookup(ordered))
2023+
2024+
if na_sentinel == -1:
2025+
# take_1d is faster, but only works for na_sentinels of -1
2026+
order2 = sorter.argsort()
2027+
new_labels = take_1d(order2, labels, fill_value=-1)
2028+
if verify:
2029+
mask = (labels < -len(values)) | (labels >= len(values))
2030+
else:
2031+
mask = None
2032+
else:
2033+
reverse_indexer = np.empty(len(sorter), dtype=np.int_)
2034+
reverse_indexer.put(sorter, np.arange(len(sorter)))
2035+
# Out of bound indices will be masked with `na_sentinel` next, so we
2036+
# may deal with them here without performance loss using `mode='wrap'`
2037+
new_labels = reverse_indexer.take(labels, mode="wrap")
2038+
2039+
mask = labels == na_sentinel
2040+
if verify:
2041+
mask = mask | (labels < -len(values)) | (labels >= len(values))
2042+
2043+
if mask is not None:
2044+
np.putmask(new_labels, mask, na_sentinel)
2045+
2046+
return ordered, ensure_platform_int(new_labels)

pandas/core/indexes/base.py

+4-5
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,6 @@
7373
import pandas.core.missing as missing
7474
from pandas.core.ops import get_op_result_name
7575
from pandas.core.ops.invalid import make_invalid_op
76-
import pandas.core.sorting as sorting
7776
from pandas.core.strings import StringMethods
7877

7978
from pandas.io.formats.printing import (
@@ -2504,7 +2503,7 @@ def _union(self, other, sort):
25042503

25052504
if sort is None:
25062505
try:
2507-
result = sorting.safe_sort(result)
2506+
result = algos.safe_sort(result)
25082507
except TypeError as e:
25092508
warnings.warn(
25102509
"{}, sort order is undefined for "
@@ -2600,7 +2599,7 @@ def intersection(self, other, sort=False):
26002599
taken = other.take(indexer)
26012600

26022601
if sort is None:
2603-
taken = sorting.safe_sort(taken.values)
2602+
taken = algos.safe_sort(taken.values)
26042603
if self.name != other.name:
26052604
name = None
26062605
else:
@@ -2670,7 +2669,7 @@ def difference(self, other, sort=None):
26702669
the_diff = this.values.take(label_diff)
26712670
if sort is None:
26722671
try:
2673-
the_diff = sorting.safe_sort(the_diff)
2672+
the_diff = algos.safe_sort(the_diff)
26742673
except TypeError:
26752674
pass
26762675

@@ -2747,7 +2746,7 @@ def symmetric_difference(self, other, result_name=None, sort=None):
27472746
the_diff = concat_compat([left_diff, right_diff])
27482747
if sort is None:
27492748
try:
2750-
the_diff = sorting.safe_sort(the_diff)
2749+
the_diff = algos.safe_sort(the_diff)
27512750
except TypeError:
27522751
pass
27532752

pandas/core/reshape/merge.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,6 @@
4444
import pandas.core.common as com
4545
from pandas.core.frame import _merge_doc
4646
from pandas.core.internals import _transform_index, concatenate_block_managers
47-
import pandas.core.sorting as sorting
4847
from pandas.core.sorting import is_int64_overflow_possible
4948

5049

@@ -1912,7 +1911,7 @@ def _sort_labels(uniques, left, right):
19121911
llength = len(left)
19131912
labels = np.concatenate([left, right])
19141913

1915-
_, new_labels = sorting.safe_sort(uniques, labels, na_sentinel=-1)
1914+
_, new_labels = algos.safe_sort(uniques, labels, na_sentinel=-1)
19161915
new_labels = ensure_int64(new_labels)
19171916
new_left, new_right = new_labels[:llength], new_labels[llength:]
19181917

pandas/core/sorting.py

-131
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,11 @@
44
from pandas._libs import algos, hashtable, lib
55
from pandas._libs.hashtable import unique_label_indices
66

7-
from pandas.core.dtypes.cast import infer_dtype_from_array
87
from pandas.core.dtypes.common import (
98
ensure_int64,
109
ensure_platform_int,
1110
is_categorical_dtype,
1211
is_extension_array_dtype,
13-
is_list_like,
1412
)
1513
from pandas.core.dtypes.missing import isna
1614

@@ -389,132 +387,3 @@ def _reorder_by_uniques(uniques, labels):
389387
uniques = algorithms.take_nd(uniques, sorter, allow_fill=False)
390388

391389
return uniques, labels
392-
393-
394-
def safe_sort(
395-
values,
396-
labels=None,
397-
na_sentinel: int = -1,
398-
assume_unique: bool = False,
399-
verify: bool = True,
400-
):
401-
"""
402-
Sort ``values`` and reorder corresponding ``labels``.
403-
``values`` should be unique if ``labels`` is not None.
404-
Safe for use with mixed types (int, str), orders ints before strs.
405-
406-
Parameters
407-
----------
408-
values : list-like
409-
Sequence; must be unique if ``labels`` is not None.
410-
labels : list_like
411-
Indices to ``values``. All out of bound indices are treated as
412-
"not found" and will be masked with ``na_sentinel``.
413-
na_sentinel : int, default -1
414-
Value in ``labels`` to mark "not found".
415-
Ignored when ``labels`` is None.
416-
assume_unique : bool, default False
417-
When True, ``values`` are assumed to be unique, which can speed up
418-
the calculation. Ignored when ``labels`` is None.
419-
verify : bool, default True
420-
Check if labels are out of bound for the values and put out of bound
421-
labels equal to na_sentinel. If ``verify=False``, it is assumed there
422-
are no out of bound labels. Ignored when ``labels`` is None.
423-
424-
.. versionadded:: 0.25.0
425-
426-
Returns
427-
-------
428-
ordered : ndarray
429-
Sorted ``values``
430-
new_labels : ndarray
431-
Reordered ``labels``; returned when ``labels`` is not None.
432-
433-
Raises
434-
------
435-
TypeError
436-
* If ``values`` is not list-like or if ``labels`` is neither None
437-
nor list-like
438-
* If ``values`` cannot be sorted
439-
ValueError
440-
* If ``labels`` is not None and ``values`` contain duplicates.
441-
"""
442-
if not is_list_like(values):
443-
raise TypeError(
444-
"Only list-like objects are allowed to be passed to safe_sort as values"
445-
)
446-
447-
if not isinstance(values, np.ndarray) and not is_extension_array_dtype(values):
448-
# don't convert to string types
449-
dtype, _ = infer_dtype_from_array(values)
450-
values = np.asarray(values, dtype=dtype)
451-
452-
def sort_mixed(values):
453-
# order ints before strings, safe in py3
454-
str_pos = np.array([isinstance(x, str) for x in values], dtype=bool)
455-
nums = np.sort(values[~str_pos])
456-
strs = np.sort(values[str_pos])
457-
return np.concatenate([nums, np.asarray(strs, dtype=object)])
458-
459-
sorter = None
460-
if (
461-
not is_extension_array_dtype(values)
462-
and lib.infer_dtype(values, skipna=False) == "mixed-integer"
463-
):
464-
# unorderable in py3 if mixed str/int
465-
ordered = sort_mixed(values)
466-
else:
467-
try:
468-
sorter = values.argsort()
469-
ordered = values.take(sorter)
470-
except TypeError:
471-
# try this anyway
472-
ordered = sort_mixed(values)
473-
474-
# labels:
475-
476-
if labels is None:
477-
return ordered
478-
479-
if not is_list_like(labels):
480-
raise TypeError(
481-
"Only list-like objects or None are allowed to be"
482-
"passed to safe_sort as labels"
483-
)
484-
labels = ensure_platform_int(np.asarray(labels))
485-
486-
from pandas import Index
487-
488-
if not assume_unique and not Index(values).is_unique:
489-
raise ValueError("values should be unique if labels is not None")
490-
491-
if sorter is None:
492-
# mixed types
493-
hash_klass, values = algorithms._get_data_algo(values)
494-
t = hash_klass(len(values))
495-
t.map_locations(values)
496-
sorter = ensure_platform_int(t.lookup(ordered))
497-
498-
if na_sentinel == -1:
499-
# take_1d is faster, but only works for na_sentinels of -1
500-
order2 = sorter.argsort()
501-
new_labels = algorithms.take_1d(order2, labels, fill_value=-1)
502-
if verify:
503-
mask = (labels < -len(values)) | (labels >= len(values))
504-
else:
505-
mask = None
506-
else:
507-
reverse_indexer = np.empty(len(sorter), dtype=np.int_)
508-
reverse_indexer.put(sorter, np.arange(len(sorter)))
509-
# Out of bound indices will be masked with `na_sentinel` next, so we
510-
# may deal with them here without performance loss using `mode='wrap'`
511-
new_labels = reverse_indexer.take(labels, mode="wrap")
512-
513-
mask = labels == na_sentinel
514-
if verify:
515-
mask = mask | (labels < -len(values)) | (labels >= len(values))
516-
517-
if mask is not None:
518-
np.putmask(new_labels, mask, na_sentinel)
519-
520-
return ordered, ensure_platform_int(new_labels)

pandas/tests/indexes/test_base.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -33,13 +33,13 @@
3333
isna,
3434
period_range,
3535
)
36+
from pandas.core.algorithms import safe_sort
3637
from pandas.core.index import (
3738
_get_combined_index,
3839
ensure_index,
3940
ensure_index_from_sequences,
4041
)
4142
from pandas.core.indexes.api import Index, MultiIndex
42-
from pandas.core.sorting import safe_sort
4343
from pandas.tests.indexes.common import Base
4444
from pandas.tests.indexes.conftest import indices_dict
4545
import pandas.util.testing as tm

pandas/tests/test_algos.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@
2626
import pandas.core.algorithms as algos
2727
from pandas.core.arrays import DatetimeArray
2828
import pandas.core.common as com
29-
from pandas.core.sorting import safe_sort
3029
import pandas.util.testing as tm
3130

3231

@@ -309,7 +308,7 @@ def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques):
309308
labels, uniques = algos.factorize(data, sort=sort, na_sentinel=na_sentinel)
310309
if sort:
311310
expected_labels = np.array([1, 0, na_sentinel, 1], dtype=np.intp)
312-
expected_uniques = safe_sort(uniques)
311+
expected_uniques = algos.safe_sort(uniques)
313312
else:
314313
expected_labels = np.array([0, 1, na_sentinel, 0], dtype=np.intp)
315314
expected_uniques = uniques

0 commit comments

Comments
 (0)