3
3
intended for public consumption
4
4
"""
5
5
from textwrap import dedent
6
- from typing import Dict
6
+ from typing import Dict , Optional , Union , Tuple
7
7
from warnings import catch_warnings , simplefilter , warn
8
8
9
9
import numpy as np
@@ -501,9 +501,9 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non
501
501
502
502
Returns
503
503
-------
504
- labels : ndarray
504
+ codes : ndarray
505
505
An integer ndarray that's an indexer into `uniques`.
506
- ``uniques.take(labels )`` will have the same values as `values`.
506
+ ``uniques.take(codes )`` will have the same values as `values`.
507
507
uniques : ndarray, Index, or Categorical
508
508
The unique valid values. When `values` is Categorical, `uniques`
509
509
is a Categorical. When `values` is some other pandas object, an
@@ -525,27 +525,27 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non
525
525
``pd.factorize(values)``. The results are identical for methods like
526
526
:meth:`Series.factorize`.
527
527
528
- >>> labels , uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'])
529
- >>> labels
528
+ >>> codes , uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'])
529
+ >>> codes
530
530
array([0, 0, 1, 2, 0])
531
531
>>> uniques
532
532
array(['b', 'a', 'c'], dtype=object)
533
533
534
- With ``sort=True``, the `uniques` will be sorted, and `labels ` will be
534
+ With ``sort=True``, the `uniques` will be sorted, and `codes ` will be
535
535
shuffled so that the relationship is maintained.
536
536
537
- >>> labels , uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True)
538
- >>> labels
537
+ >>> codes , uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True)
538
+ >>> codes
539
539
array([1, 1, 0, 2, 1])
540
540
>>> uniques
541
541
array(['a', 'b', 'c'], dtype=object)
542
542
543
- Missing values are indicated in `labels ` with `na_sentinel`
543
+ Missing values are indicated in `codes ` with `na_sentinel`
544
544
(``-1`` by default). Note that missing values are never
545
545
included in `uniques`.
546
546
547
- >>> labels , uniques = pd.factorize(['b', None, 'a', 'c', 'b'])
548
- >>> labels
547
+ >>> codes , uniques = pd.factorize(['b', None, 'a', 'c', 'b'])
548
+ >>> codes
549
549
array([ 0, -1, 1, 2, 0])
550
550
>>> uniques
551
551
array(['b', 'a', 'c'], dtype=object)
@@ -555,8 +555,8 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non
555
555
will differ. For Categoricals, a `Categorical` is returned.
556
556
557
557
>>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c'])
558
- >>> labels , uniques = pd.factorize(cat)
559
- >>> labels
558
+ >>> codes , uniques = pd.factorize(cat)
559
+ >>> codes
560
560
array([0, 0, 1])
561
561
>>> uniques
562
562
[a, c]
@@ -569,8 +569,8 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non
569
569
returned.
570
570
571
571
>>> cat = pd.Series(['a', 'a', 'c'])
572
- >>> labels , uniques = pd.factorize(cat)
573
- >>> labels
572
+ >>> codes , uniques = pd.factorize(cat)
573
+ >>> codes
574
574
array([0, 0, 1])
575
575
>>> uniques
576
576
Index(['a', 'c'], dtype='object')
@@ -596,7 +596,7 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non
596
596
sort = dedent (
597
597
"""\
598
598
sort : bool, default False
599
- Sort `uniques` and shuffle `labels ` to maintain the
599
+ Sort `uniques` and shuffle `codes ` to maintain the
600
600
relationship.
601
601
"""
602
602
),
@@ -609,11 +609,17 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non
609
609
)
610
610
@Appender (_shared_docs ["factorize" ])
611
611
@deprecate_kwarg (old_arg_name = "order" , new_arg_name = None )
612
- def factorize (values , sort : bool = False , order = None , na_sentinel = - 1 , size_hint = None ):
612
+ def factorize (
613
+ values ,
614
+ sort : bool = False ,
615
+ order = None ,
616
+ na_sentinel : int = - 1 ,
617
+ size_hint : Optional [int ] = None ,
618
+ ) -> Tuple [np .ndarray , Union [np .ndarray , ABCIndex ]]:
613
619
# Implementation notes: This method is responsible for 3 things
614
620
# 1.) coercing data to array-like (ndarray, Index, extension array)
615
- # 2.) factorizing labels and uniques
616
- # 3.) Maybe boxing the output in an Index
621
+ # 2.) factorizing codes and uniques
622
+ # 3.) Maybe boxing the uniques in an Index
617
623
#
618
624
# Step 2 is dispatched to extension types (like Categorical). They are
619
625
# responsible only for factorization. All data coercion, sorting and boxing
@@ -624,7 +630,7 @@ def factorize(values, sort: bool = False, order=None, na_sentinel=-1, size_hint=
624
630
625
631
if is_extension_array_dtype (values ):
626
632
values = extract_array (values )
627
- labels , uniques = values .factorize (na_sentinel = na_sentinel )
633
+ codes , uniques = values .factorize (na_sentinel = na_sentinel )
628
634
dtype = original .dtype
629
635
else :
630
636
values , dtype = _ensure_data (values )
@@ -634,13 +640,13 @@ def factorize(values, sort: bool = False, order=None, na_sentinel=-1, size_hint=
634
640
else :
635
641
na_value = None
636
642
637
- labels , uniques = _factorize_array (
643
+ codes , uniques = _factorize_array (
638
644
values , na_sentinel = na_sentinel , size_hint = size_hint , na_value = na_value
639
645
)
640
646
641
647
if sort and len (uniques ) > 0 :
642
- uniques , labels = safe_sort (
643
- uniques , labels , na_sentinel = na_sentinel , assume_unique = True , verify = False
648
+ uniques , codes = safe_sort (
649
+ uniques , codes , na_sentinel = na_sentinel , assume_unique = True , verify = False
644
650
)
645
651
646
652
uniques = _reconstruct_data (uniques , dtype , original )
@@ -653,7 +659,7 @@ def factorize(values, sort: bool = False, order=None, na_sentinel=-1, size_hint=
653
659
654
660
uniques = Index (uniques )
655
661
656
- return labels , uniques
662
+ return codes , uniques
657
663
658
664
659
665
def value_counts (
0 commit comments