@@ -708,7 +708,66 @@ def factorize(
708
708
>>> uniques
709
709
array(['b', 'a', 'c'], dtype=object)
710
710
711
- # ... (rest of the examples remain unchanged)
711
+ With ``sort=True``, the `uniques` will be sorted, and `codes` will be
712
+ shuffled so that the relationship is maintained.
713
+
714
+ >>> codes, uniques = pd.factorize(np.array(['b', 'b', 'a', 'c', 'b'], dtype="O"),
715
+ ... sort=True)
716
+ >>> codes
717
+ array([1, 1, 0, 2, 1])
718
+ >>> uniques
719
+ array(['a', 'b', 'c'], dtype=object)
720
+
721
+ When ``use_na_sentinel=True`` (the default), missing values are indicated in
722
+ the `codes` with the sentinel value ``-1`` and missing values are not
723
+ included in `uniques`.
724
+
725
+ >>> codes, uniques = pd.factorize(np.array(['b', None, 'a', 'c', 'b'], dtype="O"))
726
+ >>> codes
727
+ array([ 0, -1, 1, 2, 0])
728
+ >>> uniques
729
+ array(['b', 'a', 'c'], dtype=object)
730
+
731
+ Thus far, we've only factorized NumPy arrays (lists are internally coerced
732
+ to NumPy arrays). When factorizing pandas objects, the type of `uniques`
733
+ will differ. For Categoricals, a `Categorical` is returned.
734
+
735
+ >>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c'])
736
+ >>> codes, uniques = pd.factorize(cat)
737
+ >>> codes
738
+ array([0, 0, 1])
739
+ >>> uniques
740
+ ['a', 'c']
741
+ Categories (3, object): ['a', 'b', 'c']
742
+
743
+ Notice that ``'b'`` is in ``uniques.categories``, despite not being
744
+ present in ``cat.values``.
745
+
746
+ For all other pandas objects, an Index of the appropriate type is
747
+ returned.
748
+
749
+ >>> cat = pd.Series(['a', 'a', 'c'])
750
+ >>> codes, uniques = pd.factorize(cat)
751
+ >>> codes
752
+ array([0, 0, 1])
753
+ >>> uniques
754
+ Index(['a', 'c'], dtype='object')
755
+
756
+ If NaN is in the values, and we want to include NaN in the uniques of the
757
+ values, it can be achieved by setting ``use_na_sentinel=False``.
758
+
759
+ >>> values = np.array([1, 2, 1, np.nan])
760
+ >>> codes, uniques = pd.factorize(values) # default: use_na_sentinel=True
761
+ >>> codes
762
+ array([ 0, 1, 0, -1])
763
+ >>> uniques
764
+ array([1., 2.])
765
+
766
+ >>> codes, uniques = pd.factorize(values, use_na_sentinel=False)
767
+ >>> codes
768
+ array([0, 1, 0, 2])
769
+ >>> uniques
770
+ array([ 1., 2., nan])
712
771
"""
713
772
# Implementation notes: This method is responsible for 3 things
714
773
# 1.) coercing data to array-like (ndarray, Index, extension array)
0 commit comments