Skip to content

REF: move safe_sort to algos to avoid private/circular dependencies #29384

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 4, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
138 changes: 136 additions & 2 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

from pandas.core.dtypes.cast import (
construct_1d_object_array_from_listlike,
infer_dtype_from_array,
maybe_promote,
)
from pandas.core.dtypes.common import (
Expand Down Expand Up @@ -639,8 +640,6 @@ def factorize(values, sort: bool = False, order=None, na_sentinel=-1, size_hint=
)

if sort and len(uniques) > 0:
from pandas.core.sorting import safe_sort

uniques, labels = safe_sort(
uniques, labels, na_sentinel=na_sentinel, assume_unique=True, verify=False
)
Expand Down Expand Up @@ -1920,3 +1919,138 @@ def diff(arr, n: int, axis: int = 0):
out_arr = out_arr.astype("int64").view("timedelta64[ns]")

return out_arr


# --------------------------------------------------------------------
# Helper functions

# Note: safe_sort is in algorithms.py instead of sorting.py because it is
# low-dependency, is used in this module, and uses private methods from
# this module.
def safe_sort(
    values,
    labels=None,
    na_sentinel: int = -1,
    assume_unique: bool = False,
    verify: bool = True,
):
    """
    Sort ``values`` and reorder corresponding ``labels``.

    ``values`` should be unique if ``labels`` is not None.
    Safe for use with mixed types (int, str), orders ints before strs.

    Parameters
    ----------
    values : list-like
        Sequence; must be unique if ``labels`` is not None.
    labels : list-like or None, default None
        Indices to ``values``. All out of bound indices are treated as
        "not found" and will be masked with ``na_sentinel``.
    na_sentinel : int, default -1
        Value in ``labels`` to mark "not found".
        Ignored when ``labels`` is None.
    assume_unique : bool, default False
        When True, ``values`` are assumed to be unique, which can speed up
        the calculation. Ignored when ``labels`` is None.
    verify : bool, default True
        Check if labels are out of bound for the values and put out of bound
        labels equal to na_sentinel. If ``verify=False``, it is assumed there
        are no out of bound labels. Ignored when ``labels`` is None.

        .. versionadded:: 0.25.0

    Returns
    -------
    ordered : ndarray
        Sorted ``values``. Returned on its own when ``labels`` is None,
        otherwise as the first element of a 2-tuple.
    new_labels : ndarray
        Reordered ``labels``; returned when ``labels`` is not None.

    Raises
    ------
    TypeError
        * If ``values`` is not list-like or if ``labels`` is neither None
          nor list-like
        * If ``values`` cannot be sorted
    ValueError
        * If ``labels`` is not None and ``values`` contain duplicates.
    """
    if not is_list_like(values):
        raise TypeError(
            "Only list-like objects are allowed to be passed to safe_sort as values"
        )

    if not isinstance(values, np.ndarray) and not is_extension_array_dtype(values):
        # don't convert to string types
        dtype, _ = infer_dtype_from_array(values)
        values = np.asarray(values, dtype=dtype)

    def sort_mixed(values):
        # order ints before strings, safe in py3
        str_pos = np.array([isinstance(x, str) for x in values], dtype=bool)
        nums = np.sort(values[~str_pos])
        strs = np.sort(values[str_pos])
        return np.concatenate([nums, np.asarray(strs, dtype=object)])

    # ``sorter`` stays None whenever the mixed-type fallback produced
    # ``ordered``; it is then reconstructed below if labels must be remapped.
    sorter = None
    if (
        not is_extension_array_dtype(values)
        and lib.infer_dtype(values, skipna=False) == "mixed-integer"
    ):
        # unorderable in py3 if mixed str/int
        ordered = sort_mixed(values)
    else:
        try:
            sorter = values.argsort()
            ordered = values.take(sorter)
        except TypeError:
            # try this anyway
            ordered = sort_mixed(values)

    # labels:

    if labels is None:
        return ordered

    if not is_list_like(labels):
        # The two literals are concatenated, so the first one must end with a
        # space; without it the message read "...allowed to bepassed to...".
        raise TypeError(
            "Only list-like objects or None are allowed to be "
            "passed to safe_sort as labels"
        )
    labels = ensure_platform_int(np.asarray(labels))

    # imported here rather than at module level (this module is itself
    # imported while the ``pandas`` package is being set up)
    from pandas import Index

    if not assume_unique and not Index(values).is_unique:
        raise ValueError("values should be unique if labels is not None")

    if sorter is None:
        # mixed types: recover the sorting permutation by looking up the
        # position in ``values`` of every element of ``ordered``
        hash_klass, values = _get_data_algo(values)
        t = hash_klass(len(values))
        t.map_locations(values)
        sorter = ensure_platform_int(t.lookup(ordered))

    if na_sentinel == -1:
        # take_1d is faster, but only works for na_sentinels of -1
        order2 = sorter.argsort()
        new_labels = take_1d(order2, labels, fill_value=-1)
        if verify:
            # labels outside [-len(values), len(values)) are "not found"
            mask = (labels < -len(values)) | (labels >= len(values))
        else:
            mask = None
    else:
        # reverse_indexer[sorter[i]] == i, i.e. old position -> new position
        reverse_indexer = np.empty(len(sorter), dtype=np.int_)
        reverse_indexer.put(sorter, np.arange(len(sorter)))
        # Out of bound indices will be masked with `na_sentinel` next, so we
        # may deal with them here without performance loss using `mode='wrap'`
        new_labels = reverse_indexer.take(labels, mode="wrap")

        mask = labels == na_sentinel
        if verify:
            mask = mask | (labels < -len(values)) | (labels >= len(values))

    if mask is not None:
        np.putmask(new_labels, mask, na_sentinel)

    return ordered, ensure_platform_int(new_labels)
9 changes: 4 additions & 5 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,6 @@
import pandas.core.missing as missing
from pandas.core.ops import get_op_result_name
from pandas.core.ops.invalid import make_invalid_op
import pandas.core.sorting as sorting
from pandas.core.strings import StringMethods

from pandas.io.formats.printing import (
Expand Down Expand Up @@ -2507,7 +2506,7 @@ def _union(self, other, sort):

if sort is None:
try:
result = sorting.safe_sort(result)
result = algos.safe_sort(result)
except TypeError as e:
warnings.warn(
"{}, sort order is undefined for "
Expand Down Expand Up @@ -2603,7 +2602,7 @@ def intersection(self, other, sort=False):
taken = other.take(indexer)

if sort is None:
taken = sorting.safe_sort(taken.values)
taken = algos.safe_sort(taken.values)
if self.name != other.name:
name = None
else:
Expand Down Expand Up @@ -2673,7 +2672,7 @@ def difference(self, other, sort=None):
the_diff = this.values.take(label_diff)
if sort is None:
try:
the_diff = sorting.safe_sort(the_diff)
the_diff = algos.safe_sort(the_diff)
except TypeError:
pass

Expand Down Expand Up @@ -2750,7 +2749,7 @@ def symmetric_difference(self, other, result_name=None, sort=None):
the_diff = concat_compat([left_diff, right_diff])
if sort is None:
try:
the_diff = sorting.safe_sort(the_diff)
the_diff = algos.safe_sort(the_diff)
except TypeError:
pass

Expand Down
3 changes: 1 addition & 2 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@
import pandas.core.common as com
from pandas.core.frame import _merge_doc
from pandas.core.internals import _transform_index, concatenate_block_managers
import pandas.core.sorting as sorting
from pandas.core.sorting import is_int64_overflow_possible


Expand Down Expand Up @@ -1912,7 +1911,7 @@ def _sort_labels(uniques, left, right):
llength = len(left)
labels = np.concatenate([left, right])

_, new_labels = sorting.safe_sort(uniques, labels, na_sentinel=-1)
_, new_labels = algos.safe_sort(uniques, labels, na_sentinel=-1)
new_labels = ensure_int64(new_labels)
new_left, new_right = new_labels[:llength], new_labels[llength:]

Expand Down
131 changes: 0 additions & 131 deletions pandas/core/sorting.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,11 @@
from pandas._libs import algos, hashtable, lib
from pandas._libs.hashtable import unique_label_indices

from pandas.core.dtypes.cast import infer_dtype_from_array
from pandas.core.dtypes.common import (
ensure_int64,
ensure_platform_int,
is_categorical_dtype,
is_extension_array_dtype,
is_list_like,
)
from pandas.core.dtypes.missing import isna

Expand Down Expand Up @@ -389,132 +387,3 @@ def _reorder_by_uniques(uniques, labels):
uniques = algorithms.take_nd(uniques, sorter, allow_fill=False)

return uniques, labels


def safe_sort(
    values,
    labels=None,
    na_sentinel: int = -1,
    assume_unique: bool = False,
    verify: bool = True,
):
    """
    Sort ``values`` and reorder corresponding ``labels``.

    ``values`` should be unique if ``labels`` is not None.
    Safe for use with mixed types (int, str), orders ints before strs.

    Parameters
    ----------
    values : list-like
        Sequence; must be unique if ``labels`` is not None.
    labels : list-like or None, default None
        Indices to ``values``. All out of bound indices are treated as
        "not found" and will be masked with ``na_sentinel``.
    na_sentinel : int, default -1
        Value in ``labels`` to mark "not found".
        Ignored when ``labels`` is None.
    assume_unique : bool, default False
        When True, ``values`` are assumed to be unique, which can speed up
        the calculation. Ignored when ``labels`` is None.
    verify : bool, default True
        Check if labels are out of bound for the values and put out of bound
        labels equal to na_sentinel. If ``verify=False``, it is assumed there
        are no out of bound labels. Ignored when ``labels`` is None.

        .. versionadded:: 0.25.0

    Returns
    -------
    ordered : ndarray
        Sorted ``values``. Returned on its own when ``labels`` is None,
        otherwise as the first element of a 2-tuple.
    new_labels : ndarray
        Reordered ``labels``; returned when ``labels`` is not None.

    Raises
    ------
    TypeError
        * If ``values`` is not list-like or if ``labels`` is neither None
          nor list-like
        * If ``values`` cannot be sorted
    ValueError
        * If ``labels`` is not None and ``values`` contain duplicates.
    """
    if not is_list_like(values):
        raise TypeError(
            "Only list-like objects are allowed to be passed to safe_sort as values"
        )

    if not isinstance(values, np.ndarray) and not is_extension_array_dtype(values):
        # don't convert to string types
        dtype, _ = infer_dtype_from_array(values)
        values = np.asarray(values, dtype=dtype)

    def sort_mixed(values):
        # order ints before strings, safe in py3
        str_pos = np.array([isinstance(x, str) for x in values], dtype=bool)
        nums = np.sort(values[~str_pos])
        strs = np.sort(values[str_pos])
        return np.concatenate([nums, np.asarray(strs, dtype=object)])

    # ``sorter`` stays None whenever the mixed-type fallback produced
    # ``ordered``; it is then reconstructed below if labels must be remapped.
    sorter = None
    if (
        not is_extension_array_dtype(values)
        and lib.infer_dtype(values, skipna=False) == "mixed-integer"
    ):
        # unorderable in py3 if mixed str/int
        ordered = sort_mixed(values)
    else:
        try:
            sorter = values.argsort()
            ordered = values.take(sorter)
        except TypeError:
            # try this anyway
            ordered = sort_mixed(values)

    # labels:

    if labels is None:
        return ordered

    if not is_list_like(labels):
        # The two literals are concatenated, so the first one must end with a
        # space; without it the message read "...allowed to bepassed to...".
        raise TypeError(
            "Only list-like objects or None are allowed to be "
            "passed to safe_sort as labels"
        )
    labels = ensure_platform_int(np.asarray(labels))

    from pandas import Index

    if not assume_unique and not Index(values).is_unique:
        raise ValueError("values should be unique if labels is not None")

    if sorter is None:
        # mixed types: recover the sorting permutation by looking up the
        # position in ``values`` of every element of ``ordered``
        hash_klass, values = algorithms._get_data_algo(values)
        t = hash_klass(len(values))
        t.map_locations(values)
        sorter = ensure_platform_int(t.lookup(ordered))

    if na_sentinel == -1:
        # take_1d is faster, but only works for na_sentinels of -1
        order2 = sorter.argsort()
        new_labels = algorithms.take_1d(order2, labels, fill_value=-1)
        if verify:
            # labels outside [-len(values), len(values)) are "not found"
            mask = (labels < -len(values)) | (labels >= len(values))
        else:
            mask = None
    else:
        # reverse_indexer[sorter[i]] == i, i.e. old position -> new position
        reverse_indexer = np.empty(len(sorter), dtype=np.int_)
        reverse_indexer.put(sorter, np.arange(len(sorter)))
        # Out of bound indices will be masked with `na_sentinel` next, so we
        # may deal with them here without performance loss using `mode='wrap'`
        new_labels = reverse_indexer.take(labels, mode="wrap")

        mask = labels == na_sentinel
        if verify:
            mask = mask | (labels < -len(values)) | (labels >= len(values))

    if mask is not None:
        np.putmask(new_labels, mask, na_sentinel)

    return ordered, ensure_platform_int(new_labels)
2 changes: 1 addition & 1 deletion pandas/tests/indexes/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,13 @@
isna,
period_range,
)
from pandas.core.algorithms import safe_sort
from pandas.core.index import (
_get_combined_index,
ensure_index,
ensure_index_from_sequences,
)
from pandas.core.indexes.api import Index, MultiIndex
from pandas.core.sorting import safe_sort
from pandas.tests.indexes.common import Base
from pandas.tests.indexes.conftest import indices_dict
import pandas.util.testing as tm
Expand Down
3 changes: 1 addition & 2 deletions pandas/tests/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
import pandas.core.algorithms as algos
from pandas.core.arrays import DatetimeArray
import pandas.core.common as com
from pandas.core.sorting import safe_sort
import pandas.util.testing as tm


Expand Down Expand Up @@ -309,7 +308,7 @@ def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques):
labels, uniques = algos.factorize(data, sort=sort, na_sentinel=na_sentinel)
if sort:
expected_labels = np.array([1, 0, na_sentinel, 1], dtype=np.intp)
expected_uniques = safe_sort(uniques)
expected_uniques = algos.safe_sort(uniques)
else:
expected_labels = np.array([0, 1, na_sentinel, 0], dtype=np.intp)
expected_uniques = uniques
Expand Down
Loading