Skip to content

Commit e561293

Browse files
topper-123 authored and jreback committed
API: rename labels to codes (#29509)
1 parent bcbc468 commit e561293

File tree

8 files changed

+133
-127
lines changed

8 files changed

+133
-127
lines changed

pandas/core/algorithms.py

+30-24
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,7 @@
33
intended for public consumption
44
"""
55
from textwrap import dedent
6-
from typing import Dict
6+
from typing import Dict, Optional, Tuple, Union
77
from warnings import catch_warnings, simplefilter, warn
88

99
import numpy as np
@@ -501,9 +501,9 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non
501501
502502
Returns
503503
-------
504-
labels : ndarray
504+
codes : ndarray
505505
An integer ndarray that's an indexer into `uniques`.
506-
``uniques.take(labels)`` will have the same values as `values`.
506+
``uniques.take(codes)`` will have the same values as `values`.
507507
uniques : ndarray, Index, or Categorical
508508
The unique valid values. When `values` is Categorical, `uniques`
509509
is a Categorical. When `values` is some other pandas object, an
@@ -525,27 +525,27 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non
525525
``pd.factorize(values)``. The results are identical for methods like
526526
:meth:`Series.factorize`.
527527
528-
>>> labels, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'])
529-
>>> labels
528+
>>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'])
529+
>>> codes
530530
array([0, 0, 1, 2, 0])
531531
>>> uniques
532532
array(['b', 'a', 'c'], dtype=object)
533533
534-
With ``sort=True``, the `uniques` will be sorted, and `labels` will be
534+
With ``sort=True``, the `uniques` will be sorted, and `codes` will be
535535
shuffled so that the relationship is maintained.
536536
537-
>>> labels, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True)
538-
>>> labels
537+
>>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True)
538+
>>> codes
539539
array([1, 1, 0, 2, 1])
540540
>>> uniques
541541
array(['a', 'b', 'c'], dtype=object)
542542
543-
Missing values are indicated in `labels` with `na_sentinel`
543+
Missing values are indicated in `codes` with `na_sentinel`
544544
(``-1`` by default). Note that missing values are never
545545
included in `uniques`.
546546
547-
>>> labels, uniques = pd.factorize(['b', None, 'a', 'c', 'b'])
548-
>>> labels
547+
>>> codes, uniques = pd.factorize(['b', None, 'a', 'c', 'b'])
548+
>>> codes
549549
array([ 0, -1, 1, 2, 0])
550550
>>> uniques
551551
array(['b', 'a', 'c'], dtype=object)
@@ -555,8 +555,8 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non
555555
will differ. For Categoricals, a `Categorical` is returned.
556556
557557
>>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c'])
558-
>>> labels, uniques = pd.factorize(cat)
559-
>>> labels
558+
>>> codes, uniques = pd.factorize(cat)
559+
>>> codes
560560
array([0, 0, 1])
561561
>>> uniques
562562
[a, c]
@@ -569,8 +569,8 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non
569569
returned.
570570
571571
>>> cat = pd.Series(['a', 'a', 'c'])
572-
>>> labels, uniques = pd.factorize(cat)
573-
>>> labels
572+
>>> codes, uniques = pd.factorize(cat)
573+
>>> codes
574574
array([0, 0, 1])
575575
>>> uniques
576576
Index(['a', 'c'], dtype='object')
@@ -596,7 +596,7 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non
596596
sort=dedent(
597597
"""\
598598
sort : bool, default False
599-
Sort `uniques` and shuffle `labels` to maintain the
599+
Sort `uniques` and shuffle `codes` to maintain the
600600
relationship.
601601
"""
602602
),
@@ -609,11 +609,17 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non
609609
)
610610
@Appender(_shared_docs["factorize"])
611611
@deprecate_kwarg(old_arg_name="order", new_arg_name=None)
612-
def factorize(values, sort: bool = False, order=None, na_sentinel=-1, size_hint=None):
612+
def factorize(
613+
values,
614+
sort: bool = False,
615+
order=None,
616+
na_sentinel: int = -1,
617+
size_hint: Optional[int] = None,
618+
) -> Tuple[np.ndarray, Union[np.ndarray, ABCIndex]]:
613619
# Implementation notes: This method is responsible for 3 things
614620
# 1.) coercing data to array-like (ndarray, Index, extension array)
615-
# 2.) factorizing labels and uniques
616-
# 3.) Maybe boxing the output in an Index
621+
# 2.) factorizing codes and uniques
622+
# 3.) Maybe boxing the uniques in an Index
617623
#
618624
# Step 2 is dispatched to extension types (like Categorical). They are
619625
# responsible only for factorization. All data coercion, sorting and boxing
@@ -624,7 +630,7 @@ def factorize(values, sort: bool = False, order=None, na_sentinel=-1, size_hint=
624630

625631
if is_extension_array_dtype(values):
626632
values = extract_array(values)
627-
labels, uniques = values.factorize(na_sentinel=na_sentinel)
633+
codes, uniques = values.factorize(na_sentinel=na_sentinel)
628634
dtype = original.dtype
629635
else:
630636
values, dtype = _ensure_data(values)
@@ -634,13 +640,13 @@ def factorize(values, sort: bool = False, order=None, na_sentinel=-1, size_hint=
634640
else:
635641
na_value = None
636642

637-
labels, uniques = _factorize_array(
643+
codes, uniques = _factorize_array(
638644
values, na_sentinel=na_sentinel, size_hint=size_hint, na_value=na_value
639645
)
640646

641647
if sort and len(uniques) > 0:
642-
uniques, labels = safe_sort(
643-
uniques, labels, na_sentinel=na_sentinel, assume_unique=True, verify=False
648+
uniques, codes = safe_sort(
649+
uniques, codes, na_sentinel=na_sentinel, assume_unique=True, verify=False
644650
)
645651

646652
uniques = _reconstruct_data(uniques, dtype, original)
@@ -653,7 +659,7 @@ def factorize(values, sort: bool = False, order=None, na_sentinel=-1, size_hint=
653659

654660
uniques = Index(uniques)
655661

656-
return labels, uniques
662+
return codes, uniques
657663

658664

659665
def value_counts(

pandas/core/arrays/base.py

+4-4
Original file line number | Diff line number | Diff line change
@@ -690,11 +690,11 @@ def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ABCExtensionArra
690690
Parameters
691691
----------
692692
na_sentinel : int, default -1
693-
Value to use in the `labels` array to indicate missing values.
693+
Value to use in the `codes` array to indicate missing values.
694694
695695
Returns
696696
-------
697-
labels : ndarray
697+
codes : ndarray
698698
An integer NumPy array that's an indexer into the original
699699
ExtensionArray.
700700
uniques : ExtensionArray
@@ -724,12 +724,12 @@ def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ABCExtensionArra
724724
# Complete control over factorization.
725725
arr, na_value = self._values_for_factorize()
726726

727-
labels, uniques = _factorize_array(
727+
codes, uniques = _factorize_array(
728728
arr, na_sentinel=na_sentinel, na_value=na_value
729729
)
730730

731731
uniques = self._from_factorized(uniques, self)
732-
return labels, uniques
732+
return codes, uniques
733733

734734
_extension_array_shared_docs[
735735
"repeat"

pandas/core/arrays/sparse/array.py

+3-3
Original file line number | Diff line number | Diff line change
@@ -710,11 +710,11 @@ def factorize(self, na_sentinel=-1):
710710
# Currently, ExtensionArray.factorize -> Tuple[ndarray, EA]
711711
# The sparsity on this is backwards from what Sparse would want. Want
712712
# ExtensionArray.factorize -> Tuple[EA, EA]
713-
# Given that we have to return a dense array of labels, why bother
713+
# Given that we have to return a dense array of codes, why bother
714714
# implementing an efficient factorize?
715-
labels, uniques = algos.factorize(np.asarray(self), na_sentinel=na_sentinel)
715+
codes, uniques = algos.factorize(np.asarray(self), na_sentinel=na_sentinel)
716716
uniques = SparseArray(uniques, dtype=self.dtype)
717-
return labels, uniques
717+
return codes, uniques
718718

719719
def value_counts(self, dropna=True):
720720
"""

pandas/core/base.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -1518,7 +1518,7 @@ def memory_usage(self, deep=False):
15181518
sort=textwrap.dedent(
15191519
"""\
15201520
sort : bool, default False
1521-
Sort `uniques` and shuffle `labels` to maintain the
1521+
Sort `uniques` and shuffle `codes` to maintain the
15221522
relationship.
15231523
"""
15241524
),

pandas/tests/arrays/categorical/test_algos.py

+9-9
Original file line number | Diff line number | Diff line change
@@ -11,23 +11,23 @@ def test_factorize(categories, ordered):
1111
cat = pd.Categorical(
1212
["b", "b", "a", "c", None], categories=categories, ordered=ordered
1313
)
14-
labels, uniques = pd.factorize(cat)
15-
expected_labels = np.array([0, 0, 1, 2, -1], dtype=np.intp)
14+
codes, uniques = pd.factorize(cat)
15+
expected_codes = np.array([0, 0, 1, 2, -1], dtype=np.intp)
1616
expected_uniques = pd.Categorical(
1717
["b", "a", "c"], categories=categories, ordered=ordered
1818
)
1919

20-
tm.assert_numpy_array_equal(labels, expected_labels)
20+
tm.assert_numpy_array_equal(codes, expected_codes)
2121
tm.assert_categorical_equal(uniques, expected_uniques)
2222

2323

2424
def test_factorized_sort():
2525
cat = pd.Categorical(["b", "b", None, "a"])
26-
labels, uniques = pd.factorize(cat, sort=True)
27-
expected_labels = np.array([1, 1, -1, 0], dtype=np.intp)
26+
codes, uniques = pd.factorize(cat, sort=True)
27+
expected_codes = np.array([1, 1, -1, 0], dtype=np.intp)
2828
expected_uniques = pd.Categorical(["a", "b"])
2929

30-
tm.assert_numpy_array_equal(labels, expected_labels)
30+
tm.assert_numpy_array_equal(codes, expected_codes)
3131
tm.assert_categorical_equal(uniques, expected_uniques)
3232

3333

@@ -36,13 +36,13 @@ def test_factorized_sort_ordered():
3636
["b", "b", None, "a"], categories=["c", "b", "a"], ordered=True
3737
)
3838

39-
labels, uniques = pd.factorize(cat, sort=True)
40-
expected_labels = np.array([0, 0, -1, 1], dtype=np.intp)
39+
codes, uniques = pd.factorize(cat, sort=True)
40+
expected_codes = np.array([0, 0, -1, 1], dtype=np.intp)
4141
expected_uniques = pd.Categorical(
4242
["b", "a"], categories=["c", "b", "a"], ordered=True
4343
)
4444

45-
tm.assert_numpy_array_equal(labels, expected_labels)
45+
tm.assert_numpy_array_equal(codes, expected_codes)
4646
tm.assert_categorical_equal(uniques, expected_uniques)
4747

4848

pandas/tests/extension/base/methods.py

+10-10
Original file line number | Diff line number | Diff line change
@@ -113,29 +113,29 @@ def test_unique(self, data, box, method):
113113

114114
@pytest.mark.parametrize("na_sentinel", [-1, -2])
115115
def test_factorize(self, data_for_grouping, na_sentinel):
116-
labels, uniques = pd.factorize(data_for_grouping, na_sentinel=na_sentinel)
117-
expected_labels = np.array(
116+
codes, uniques = pd.factorize(data_for_grouping, na_sentinel=na_sentinel)
117+
expected_codes = np.array(
118118
[0, 0, na_sentinel, na_sentinel, 1, 1, 0, 2], dtype=np.intp
119119
)
120120
expected_uniques = data_for_grouping.take([0, 4, 7])
121121

122-
tm.assert_numpy_array_equal(labels, expected_labels)
122+
tm.assert_numpy_array_equal(codes, expected_codes)
123123
self.assert_extension_array_equal(uniques, expected_uniques)
124124

125125
@pytest.mark.parametrize("na_sentinel", [-1, -2])
126126
def test_factorize_equivalence(self, data_for_grouping, na_sentinel):
127-
l1, u1 = pd.factorize(data_for_grouping, na_sentinel=na_sentinel)
128-
l2, u2 = data_for_grouping.factorize(na_sentinel=na_sentinel)
127+
codes_1, uniques_1 = pd.factorize(data_for_grouping, na_sentinel=na_sentinel)
128+
codes_2, uniques_2 = data_for_grouping.factorize(na_sentinel=na_sentinel)
129129

130-
tm.assert_numpy_array_equal(l1, l2)
131-
self.assert_extension_array_equal(u1, u2)
130+
tm.assert_numpy_array_equal(codes_1, codes_2)
131+
self.assert_extension_array_equal(uniques_1, uniques_2)
132132

133133
def test_factorize_empty(self, data):
134-
labels, uniques = pd.factorize(data[:0])
135-
expected_labels = np.array([], dtype=np.intp)
134+
codes, uniques = pd.factorize(data[:0])
135+
expected_codes = np.array([], dtype=np.intp)
136136
expected_uniques = type(data)._from_sequence([], dtype=data[:0].dtype)
137137

138-
tm.assert_numpy_array_equal(labels, expected_labels)
138+
tm.assert_numpy_array_equal(codes, expected_codes)
139139
self.assert_extension_array_equal(uniques, expected_uniques)
140140

141141
def test_fillna_copy_frame(self, data_missing):

0 commit comments

Comments
 (0)