Skip to content

Commit 5519bcc

Browse files
topper-123 authored and Mateusz Górski committed
REF: rename labels to codes in safe_sort and _factorize (pandas-dev#29552)
1 parent b53523f commit 5519bcc

File tree

2 files changed: +74 -73 lines changed

pandas/core/algorithms.py

+38 -37
Original file line number | Diff line number | Diff line change
@@ -448,9 +448,11 @@ def isin(comps, values) -> np.ndarray:
448448
return f(comps, values)
449449

450450

451-
def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=None):
451+
def _factorize_array(
452+
values, na_sentinel: int = -1, size_hint=None, na_value=None
453+
) -> Tuple[np.ndarray, np.ndarray]:
452454
"""
453-
Factorize an array-like to labels and uniques.
455+
Factorize an array-like to codes and uniques.
454456
455457
This doesn't do any coercion of types or unboxing before factorization.
456458
@@ -468,18 +470,16 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non
468470
469471
Returns
470472
-------
471-
labels : ndarray
473+
codes : ndarray
472474
uniques : ndarray
473475
"""
474476
hash_klass, values = _get_data_algo(values)
475477

476478
table = hash_klass(size_hint or len(values))
477-
uniques, labels = table.factorize(
478-
values, na_sentinel=na_sentinel, na_value=na_value
479-
)
479+
uniques, codes = table.factorize(values, na_sentinel=na_sentinel, na_value=na_value)
480480

481-
labels = ensure_platform_int(labels)
482-
return labels, uniques
481+
codes = ensure_platform_int(codes)
482+
return codes, uniques
483483

484484

485485
_shared_docs[
@@ -1924,51 +1924,52 @@ def diff(arr, n: int, axis: int = 0):
19241924
# this module.
19251925
def safe_sort(
19261926
values,
1927-
labels=None,
1927+
codes=None,
19281928
na_sentinel: int = -1,
19291929
assume_unique: bool = False,
19301930
verify: bool = True,
1931-
):
1931+
) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
19321932
"""
1933-
Sort ``values`` and reorder corresponding ``labels``.
1934-
``values`` should be unique if ``labels`` is not None.
1933+
Sort ``values`` and reorder corresponding ``codes``.
1934+
1935+
``values`` should be unique if ``codes`` is not None.
19351936
Safe for use with mixed types (int, str), orders ints before strs.
19361937
19371938
Parameters
19381939
----------
19391940
values : list-like
1940-
Sequence; must be unique if ``labels`` is not None.
1941-
labels : list_like
1941+
Sequence; must be unique if ``codes`` is not None.
1942+
codes : list_like, optional
19421943
Indices to ``values``. All out of bound indices are treated as
19431944
"not found" and will be masked with ``na_sentinel``.
19441945
na_sentinel : int, default -1
1945-
Value in ``labels`` to mark "not found".
1946-
Ignored when ``labels`` is None.
1946+
Value in ``codes`` to mark "not found".
1947+
Ignored when ``codes`` is None.
19471948
assume_unique : bool, default False
19481949
When True, ``values`` are assumed to be unique, which can speed up
1949-
the calculation. Ignored when ``labels`` is None.
1950+
the calculation. Ignored when ``codes`` is None.
19501951
verify : bool, default True
1951-
Check if labels are out of bound for the values and put out of bound
1952-
labels equal to na_sentinel. If ``verify=False``, it is assumed there
1953-
are no out of bound labels. Ignored when ``labels`` is None.
1952+
Check if codes are out of bound for the values and put out of bound
1953+
codes equal to na_sentinel. If ``verify=False``, it is assumed there
1954+
are no out of bound codes. Ignored when ``codes`` is None.
19541955
19551956
.. versionadded:: 0.25.0
19561957
19571958
Returns
19581959
-------
19591960
ordered : ndarray
19601961
Sorted ``values``
1961-
new_labels : ndarray
1962-
Reordered ``labels``; returned when ``labels`` is not None.
1962+
new_codes : ndarray
1963+
Reordered ``codes``; returned when ``codes`` is not None.
19631964
19641965
Raises
19651966
------
19661967
TypeError
1967-
* If ``values`` is not list-like or if ``labels`` is neither None
1968+
* If ``values`` is not list-like or if ``codes`` is neither None
19681969
nor list-like
19691970
* If ``values`` cannot be sorted
19701971
ValueError
1971-
* If ``labels`` is not None and ``values`` contain duplicates.
1972+
* If ``codes`` is not None and ``values`` contain duplicates.
19721973
"""
19731974
if not is_list_like(values):
19741975
raise TypeError(
@@ -2002,22 +2003,22 @@ def sort_mixed(values):
20022003
# try this anyway
20032004
ordered = sort_mixed(values)
20042005

2005-
# labels:
2006+
# codes:
20062007

2007-
if labels is None:
2008+
if codes is None:
20082009
return ordered
20092010

2010-
if not is_list_like(labels):
2011+
if not is_list_like(codes):
20112012
raise TypeError(
20122013
"Only list-like objects or None are allowed to be"
2013-
"passed to safe_sort as labels"
2014+
"passed to safe_sort as codes"
20142015
)
2015-
labels = ensure_platform_int(np.asarray(labels))
2016+
codes = ensure_platform_int(np.asarray(codes))
20162017

20172018
from pandas import Index
20182019

20192020
if not assume_unique and not Index(values).is_unique:
2020-
raise ValueError("values should be unique if labels is not None")
2021+
raise ValueError("values should be unique if codes is not None")
20212022

20222023
if sorter is None:
20232024
# mixed types
@@ -2029,23 +2030,23 @@ def sort_mixed(values):
20292030
if na_sentinel == -1:
20302031
# take_1d is faster, but only works for na_sentinels of -1
20312032
order2 = sorter.argsort()
2032-
new_labels = take_1d(order2, labels, fill_value=-1)
2033+
new_codes = take_1d(order2, codes, fill_value=-1)
20332034
if verify:
2034-
mask = (labels < -len(values)) | (labels >= len(values))
2035+
mask = (codes < -len(values)) | (codes >= len(values))
20352036
else:
20362037
mask = None
20372038
else:
20382039
reverse_indexer = np.empty(len(sorter), dtype=np.int_)
20392040
reverse_indexer.put(sorter, np.arange(len(sorter)))
20402041
# Out of bound indices will be masked with `na_sentinel` next, so we
20412042
# may deal with them here without performance loss using `mode='wrap'`
2042-
new_labels = reverse_indexer.take(labels, mode="wrap")
2043+
new_codes = reverse_indexer.take(codes, mode="wrap")
20432044

2044-
mask = labels == na_sentinel
2045+
mask = codes == na_sentinel
20452046
if verify:
2046-
mask = mask | (labels < -len(values)) | (labels >= len(values))
2047+
mask = mask | (codes < -len(values)) | (codes >= len(values))
20472048

20482049
if mask is not None:
2049-
np.putmask(new_labels, mask, na_sentinel)
2050+
np.putmask(new_codes, mask, na_sentinel)
20502051

2051-
return ordered, ensure_platform_int(new_labels)
2052+
return ordered, ensure_platform_int(new_codes)

pandas/tests/test_sorting.py

+36 -36
Original file line number | Diff line number | Diff line change
@@ -314,27 +314,27 @@ def verify_order(df):
314314

315315

316316
def test_decons():
317-
def testit(label_list, shape):
318-
group_index = get_group_index(label_list, shape, sort=True, xnull=True)
319-
label_list2 = decons_group_index(group_index, shape)
317+
def testit(codes_list, shape):
318+
group_index = get_group_index(codes_list, shape, sort=True, xnull=True)
319+
codes_list2 = decons_group_index(group_index, shape)
320320

321-
for a, b in zip(label_list, label_list2):
321+
for a, b in zip(codes_list, codes_list2):
322322
tm.assert_numpy_array_equal(a, b)
323323

324324
shape = (4, 5, 6)
325-
label_list = [
325+
codes_list = [
326326
np.tile([0, 1, 2, 3, 0, 1, 2, 3], 100).astype(np.int64),
327327
np.tile([0, 2, 4, 3, 0, 1, 2, 3], 100).astype(np.int64),
328328
np.tile([5, 1, 0, 2, 3, 0, 5, 4], 100).astype(np.int64),
329329
]
330-
testit(label_list, shape)
330+
testit(codes_list, shape)
331331

332332
shape = (10000, 10000)
333-
label_list = [
333+
codes_list = [
334334
np.tile(np.arange(10000, dtype=np.int64), 5),
335335
np.tile(np.arange(10000, dtype=np.int64), 5),
336336
]
337-
testit(label_list, shape)
337+
testit(codes_list, shape)
338338

339339

340340
class TestSafeSort:
@@ -355,42 +355,42 @@ def test_basic_sort(self):
355355
tm.assert_numpy_array_equal(result, expected)
356356

357357
@pytest.mark.parametrize("verify", [True, False])
358-
def test_labels(self, verify):
358+
def test_codes(self, verify):
359359
values = [3, 1, 2, 0, 4]
360360
expected = np.array([0, 1, 2, 3, 4])
361361

362-
labels = [0, 1, 1, 2, 3, 0, -1, 4]
363-
result, result_labels = safe_sort(values, labels, verify=verify)
364-
expected_labels = np.array([3, 1, 1, 2, 0, 3, -1, 4], dtype=np.intp)
362+
codes = [0, 1, 1, 2, 3, 0, -1, 4]
363+
result, result_codes = safe_sort(values, codes, verify=verify)
364+
expected_codes = np.array([3, 1, 1, 2, 0, 3, -1, 4], dtype=np.intp)
365365
tm.assert_numpy_array_equal(result, expected)
366-
tm.assert_numpy_array_equal(result_labels, expected_labels)
366+
tm.assert_numpy_array_equal(result_codes, expected_codes)
367367

368368
# na_sentinel
369-
labels = [0, 1, 1, 2, 3, 0, 99, 4]
370-
result, result_labels = safe_sort(values, labels, na_sentinel=99, verify=verify)
371-
expected_labels = np.array([3, 1, 1, 2, 0, 3, 99, 4], dtype=np.intp)
369+
codes = [0, 1, 1, 2, 3, 0, 99, 4]
370+
result, result_codes = safe_sort(values, codes, na_sentinel=99, verify=verify)
371+
expected_codes = np.array([3, 1, 1, 2, 0, 3, 99, 4], dtype=np.intp)
372372
tm.assert_numpy_array_equal(result, expected)
373-
tm.assert_numpy_array_equal(result_labels, expected_labels)
373+
tm.assert_numpy_array_equal(result_codes, expected_codes)
374374

375-
labels = []
376-
result, result_labels = safe_sort(values, labels, verify=verify)
377-
expected_labels = np.array([], dtype=np.intp)
375+
codes = []
376+
result, result_codes = safe_sort(values, codes, verify=verify)
377+
expected_codes = np.array([], dtype=np.intp)
378378
tm.assert_numpy_array_equal(result, expected)
379-
tm.assert_numpy_array_equal(result_labels, expected_labels)
379+
tm.assert_numpy_array_equal(result_codes, expected_codes)
380380

381381
@pytest.mark.parametrize("na_sentinel", [-1, 99])
382-
def test_labels_out_of_bound(self, na_sentinel):
382+
def test_codes_out_of_bound(self, na_sentinel):
383383
values = [3, 1, 2, 0, 4]
384384
expected = np.array([0, 1, 2, 3, 4])
385385

386386
# out of bound indices
387-
labels = [0, 101, 102, 2, 3, 0, 99, 4]
388-
result, result_labels = safe_sort(values, labels, na_sentinel=na_sentinel)
389-
expected_labels = np.array(
387+
codes = [0, 101, 102, 2, 3, 0, 99, 4]
388+
result, result_codes = safe_sort(values, codes, na_sentinel=na_sentinel)
389+
expected_codes = np.array(
390390
[3, na_sentinel, na_sentinel, 2, 0, 3, na_sentinel, 4], dtype=np.intp
391391
)
392392
tm.assert_numpy_array_equal(result, expected)
393-
tm.assert_numpy_array_equal(result_labels, expected_labels)
393+
tm.assert_numpy_array_equal(result_codes, expected_codes)
394394

395395
def test_mixed_integer(self):
396396
values = np.array(["b", 1, 0, "a", 0, "b"], dtype=object)
@@ -399,12 +399,12 @@ def test_mixed_integer(self):
399399
tm.assert_numpy_array_equal(result, expected)
400400

401401
values = np.array(["b", 1, 0, "a"], dtype=object)
402-
labels = [0, 1, 2, 3, 0, -1, 1]
403-
result, result_labels = safe_sort(values, labels)
402+
codes = [0, 1, 2, 3, 0, -1, 1]
403+
result, result_codes = safe_sort(values, codes)
404404
expected = np.array([0, 1, "a", "b"], dtype=object)
405-
expected_labels = np.array([3, 1, 0, 2, 3, -1, 1], dtype=np.intp)
405+
expected_codes = np.array([3, 1, 0, 2, 3, -1, 1], dtype=np.intp)
406406
tm.assert_numpy_array_equal(result, expected)
407-
tm.assert_numpy_array_equal(result_labels, expected_labels)
407+
tm.assert_numpy_array_equal(result_codes, expected_codes)
408408

409409
def test_mixed_integer_from_list(self):
410410
values = ["b", 1, 0, "a", 0, "b"]
@@ -428,10 +428,10 @@ def test_exceptions(self):
428428
safe_sort(values=1)
429429

430430
with pytest.raises(TypeError, match="Only list-like objects or None"):
431-
safe_sort(values=[0, 1, 2], labels=1)
431+
safe_sort(values=[0, 1, 2], codes=1)
432432

433433
with pytest.raises(ValueError, match="values should be unique"):
434-
safe_sort(values=[0, 1, 2, 1], labels=[0, 1])
434+
safe_sort(values=[0, 1, 2, 1], codes=[0, 1])
435435

436436
def test_extension_array(self):
437437
# a = array([1, 3, np.nan, 2], dtype='Int64')
@@ -443,12 +443,12 @@ def test_extension_array(self):
443443

444444
@pytest.mark.parametrize("verify", [True, False])
445445
@pytest.mark.parametrize("na_sentinel", [-1, 99])
446-
def test_extension_array_labels(self, verify, na_sentinel):
446+
def test_extension_array_codes(self, verify, na_sentinel):
447447
a = array([1, 3, 2], dtype="Int64")
448-
result, labels = safe_sort(
448+
result, codes = safe_sort(
449449
a, [0, 1, na_sentinel, 2], na_sentinel=na_sentinel, verify=verify
450450
)
451451
expected_values = array([1, 2, 3], dtype="Int64")
452-
expected_labels = np.array([0, 2, na_sentinel, 1], dtype=np.intp)
452+
expected_codes = np.array([0, 2, na_sentinel, 1], dtype=np.intp)
453453
tm.assert_extension_array_equal(result, expected_values)
454-
tm.assert_numpy_array_equal(labels, expected_labels)
454+
tm.assert_numpy_array_equal(codes, expected_codes)

0 commit comments

Comments
 (0)